From 70cb0f65aab09d9906b6ad257b9ce2f4dedbcdc4 Mon Sep 17 00:00:00 2001 From: miemie2013 <53960695+miemie2013@users.noreply.github.com> Date: Fri, 22 Jul 2022 09:55:22 +0800 Subject: [PATCH 01/18] Add DeformableConv2D --- src/CMakeLists.txt | 1 + src/layer/deformableconv2d.cpp | 269 +++++++++++++++++++++++++++++++++ src/layer/deformableconv2d.h | 63 ++++++++ 3 files changed, 333 insertions(+) create mode 100644 src/layer/deformableconv2d.cpp create mode 100644 src/layer/deformableconv2d.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a5582fc8b96..19675780cb4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -71,6 +71,7 @@ ncnn_add_layer(Concat) ncnn_add_layer(Convolution) ncnn_add_layer(Crop) ncnn_add_layer(Deconvolution) +ncnn_add_layer(DeformableConv2D) ncnn_add_layer(Dropout) ncnn_add_layer(Eltwise) ncnn_add_layer(ELU) diff --git a/src/layer/deformableconv2d.cpp b/src/layer/deformableconv2d.cpp new file mode 100644 index 00000000000..6d4b7434d06 --- /dev/null +++ b/src/layer/deformableconv2d.cpp @@ -0,0 +1,269 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "deformableconv2d.h" + +#include "layer_type.h" + +namespace ncnn { + +DeformableConv2D::DeformableConv2D() +{ + one_blob_only = false; + support_inplace = false; +} + +int DeformableConv2D::load_param(const ParamDict& pd) +{ + num_output = pd.get(0, 0); + kernel_w = pd.get(1, 0); + kernel_h = pd.get(11, kernel_w); + dilation_w = pd.get(2, 1); + dilation_h = pd.get(12, dilation_w); + stride_w = pd.get(3, 1); + stride_h = pd.get(13, stride_w); + pad_left = pd.get(4, 0); + pad_right = pd.get(15, pad_left); + pad_top = pd.get(14, pad_left); + pad_bottom = pd.get(16, pad_top); + bias_term = pd.get(5, 0); + weight_data_size = pd.get(6, 0); + activation_type = pd.get(9, 0); + activation_params = pd.get(10, Mat()); + return 0; +} + +int DeformableConv2D::load_model(const ModelBin& mb) +{ + weight_data = mb.load(weight_data_size, 0); + if (weight_data.empty()) + return -100; + + if (bias_term) + { + bias_data = mb.load(num_output, 1); + if (bias_data.empty()) + return -100; + } + + const int in_c = weight_data_size / (num_output * kernel_h * kernel_w); + weight_data = weight_data.reshape(kernel_w * kernel_h, in_c, num_output); + weight_data_t.create(in_c, kernel_w * kernel_h, num_output); + if (weight_data_t.empty()) + return -100; + for (int q = 0; q < num_output; q++) + { + const Mat m = weight_data.channel(q); + float* outptr = weight_data_t.channel(q); + + for (int i = 0; i < kernel_w * kernel_h; i++) + { + for (int j = 0; j < in_c; j++) + { + *outptr++ = m.row(j)[i]; + } + } + } + weight_data_t = weight_data_t.reshape(in_c * kernel_w * kernel_h, num_output); + return 0; +} + +int DeformableConv2D::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + 
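// expected layout, as indexed by the sampling loop below: offset carries 2 * kernel_h * kernel_w channels of (delta_h, delta_w) pairs, mask carries kernel_h * kernel_w per-tap modulation channels
+    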
const Mat& offset = bottom_blobs[1]; + const Mat& mask = bottom_blobs[2]; + + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int in_c = bottom_blob.c; + const size_t elemsize = bottom_blob.elemsize; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int out_w = (w + pad_left + pad_right - kernel_extent_w) / stride_w + 1; + const int out_h = (h + pad_top + pad_bottom - kernel_extent_h) / stride_h + 1; + + // output = im2col matmul weight_t, im2col.shape is [out_h * out_w, kernel_h * kernel_w * in_c] (in python), + // weight_t.shape is [num_output, kernel_h * kernel_w * in_c] (in python), + // output.shape is [out_h * out_w, num_output] (in python). + Mat im2col; + im2col.create(kernel_h * kernel_w * in_c * out_h * out_w, elemsize, opt.blob_allocator); + if (im2col.empty()) + return -100; + + Mat& output = top_blobs[0]; + output.create(num_output, out_h * out_w, elemsize, opt.blob_allocator); + if (output.empty()) + return -100; + + Mat bottom_blob_flatten = bottom_blob.reshape(w * h * in_c); + Mat offset_flatten = offset.reshape(offset.w * offset.h * offset.c); + Mat mask_flatten = mask.reshape(mask.w * mask.h * mask.c); + const float* data_im_ptr = bottom_blob_flatten; + const float* data_offset_ptr = offset_flatten; + const float* data_mask_ptr = mask_flatten; + float* im2col_ptr = im2col; + + // im2col + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < out_h; h_col++) + { + for (int w_col = 0; w_col < out_w; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + float* data_col_ptr = im2col_ptr + (h_col * out_w + w_col) * kernel_h * kernel_w * in_c; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + const int data_offset_h_ptr = (((i * kernel_w + j) * 2) * out_h + h_col) * out_w + w_col; + const int data_offset_w_ptr = (((i * kernel_w + j) * 2 + 1) * out_h + h_col) * out_w + w_col; + const int data_mask_hw_ptr = ((i * kernel_w + j) * out_h + h_col) * out_w + w_col; + + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask_ = data_mask_ptr[data_mask_hw_ptr]; + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) { + v1_pos = h_low * w + w_low; + } + if (v2_cond) { + v2_pos = h_low * w + w_high; + } + if (v3_cond) { + v3_pos = h_high * w + w_low; + } + if (v4_cond) { + v4_pos = h_high * w + w_high; + } + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + const float* data_im_channel_ptr = data_im_ptr; + for (int c_im = 0; c_im < in_c; c_im++) + { + float val = 0.f; + if (cond) { + float v1 = 
0.f; + if (v1_cond) { + v1 = data_im_channel_ptr[v1_pos]; + } + float v2 = 0.f; + if (v2_cond) { + v2 = data_im_channel_ptr[v2_pos]; + } + float v3 = 0.f; + if (v3_cond) { + v3 = data_im_channel_ptr[v3_pos]; + } + float v4 = 0.f; + if (v4_cond) { + v4 = data_im_channel_ptr[v4_pos]; + } + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + *data_col_ptr = val * mask_; + data_col_ptr += 1; + data_im_channel_ptr += h*w; + } + } + } + } + } + im2col = im2col.reshape(kernel_h * kernel_w * in_c, out_h * out_w); + + // call InnerProduct + ncnn::Layer* innerProduct = ncnn::create_layer(ncnn::LayerType::InnerProduct); + + // set param + ncnn::ParamDict pd; + pd.set(0, num_output); + pd.set(1, bias_term); + pd.set(2, weight_data_size); + pd.set(9, activation_type); + pd.set(10, activation_params); + innerProduct->load_param(pd); + + // set weights + ncnn::Mat weights[2]; + weights[0] = weight_data_t; + if (bias_term) + { + weights[1] = bias_data; + } + innerProduct->load_model(ncnn::ModelBinFromMatArray(weights)); + innerProduct->create_pipeline(opt); + + // forward + innerProduct->forward(im2col, output, opt); + innerProduct->destroy_pipeline(opt); + delete innerProduct; + + ncnn::Mat output_t; + // call Permute + ncnn::Layer* permute = ncnn::create_layer(ncnn::LayerType::Permute); + + // set param + ncnn::ParamDict permute_pd; + permute_pd.set(0, 1); + permute->load_param(permute_pd); + permute->create_pipeline(opt); + // forward + permute->forward(output, output_t, opt); + permute->destroy_pipeline(opt); + delete permute; + output_t = output_t.reshape(out_w, out_h, num_output); + top_blobs[0] = output_t; + return 0; +} + +} // namespace ncnn diff --git a/src/layer/deformableconv2d.h b/src/layer/deformableconv2d.h new file mode 100644 index 00000000000..8e3b5c247ba --- /dev/null +++ b/src/layer/deformableconv2d.h @@ -0,0 +1,63 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
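+// DeformableConv2D: modulated deformable convolution (DCNv2-style), where learned per-tap offsets
+// shift the sampling grid and a per-tap mask rescales each bilinearly interpolated sample.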
+ +#ifndef LAYER_DEFORMABLECONV2D_H +#define LAYER_DEFORMABLECONV2D_H + +#include "layer.h" + +namespace ncnn { + +class DeformableConv2D : public Layer +{ +public: + DeformableConv2D(); + + virtual int load_param(const ParamDict& pd); + + virtual int load_model(const ModelBin& mb); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +public: + // param + int num_output; + int kernel_w; + int kernel_h; + int dilation_w; + int dilation_h; + int stride_w; + int stride_h; + int pad_left; // -233=SAME_UPPER -234=SAME_LOWER + int pad_right; + int pad_top; + int pad_bottom; + int bias_term; + + int weight_data_size; + + // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid + int activation_type; + Mat activation_params; + + // model + Mat weight_data; + Mat bias_data; + + Mat weight_data_t; +}; + +} // namespace ncnn + +#endif // LAYER_DEFORMABLECONV2D_H From 99b2dfc728621a0c9234a41e1802e24c2a3a485f Mon Sep 17 00:00:00 2001 From: miemie2013 <53960695+miemie2013@users.noreply.github.com> Date: Fri, 22 Jul 2022 16:09:00 +0800 Subject: [PATCH 02/18] Add DeformableConv2D Add DeformableConv2D --- src/layer/deformableconv2d.cpp | 247 ++++++++++-------------- src/layer/deformableconv2d.h | 2 - src/layer/x86/deformableconv2d_x86.cpp | 249 +++++++++++++++++++++++++ src/layer/x86/deformableconv2d_x86.h | 37 ++++ 4 files changed, 381 insertions(+), 154 deletions(-) create mode 100644 src/layer/x86/deformableconv2d_x86.cpp create mode 100644 src/layer/x86/deformableconv2d_x86.h diff --git a/src/layer/deformableconv2d.cpp b/src/layer/deformableconv2d.cpp index 6d4b7434d06..c35134b9a88 100644 --- a/src/layer/deformableconv2d.cpp +++ b/src/layer/deformableconv2d.cpp @@ -14,7 +14,7 @@ #include "deformableconv2d.h" -#include "layer_type.h" +#include "fused_activation.h" namespace ncnn { @@ -56,26 +56,6 @@ int DeformableConv2D::load_model(const ModelBin& mb) if (bias_data.empty()) return -100; } - - const int in_c = weight_data_size / (num_output * kernel_h * kernel_w); - weight_data = weight_data.reshape(kernel_w * kernel_h, in_c, num_output); - weight_data_t.create(in_c, kernel_w * kernel_h, num_output); - if (weight_data_t.empty()) - return -100; - for (int q = 0; q < num_output; q++) - { - const Mat m = weight_data.channel(q); - float* outptr = weight_data_t.channel(q); - - for (int i = 0; i < kernel_w * kernel_h; i++) - { - for (int j = 0; j < in_c; j++) - { - *outptr++ = m.row(j)[i]; - } - } - } - weight_data_t = weight_data_t.reshape(in_c * kernel_w * kernel_h, num_output); return 0; } @@ -96,28 +76,26 @@ int DeformableConv2D::forward(const std::vector& bottom_blobs, std::vector< const int out_w = (w + pad_left + pad_right - kernel_extent_w) / stride_w + 1; const int out_h = (h + pad_top + pad_bottom - kernel_extent_h) / stride_h + 1; - // output = im2col matmul weight_t, im2col.shape is [out_h * out_w, kernel_h * kernel_w * in_c] (in python), - // weight_t.shape is [num_output, kernel_h * kernel_w * in_c] (in python), - // output.shape is [out_h * out_w, num_output] (in python). - Mat im2col; - im2col.create(kernel_h * kernel_w * in_c * out_h * out_w, elemsize, opt.blob_allocator); - if (im2col.empty()) - return -100; - + // output.shape is [num_output, out_h, out_w] (in python). Mat& output = top_blobs[0]; - output.create(num_output, out_h * out_w, elemsize, opt.blob_allocator); + output.create(out_w * out_h * num_output, elemsize, opt.blob_allocator); if (output.empty()) return -100; + // bottom_blob.shape is [in_c, h, w] (in python). 
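+    // (flattening to 1D makes the data contiguous, dropping any per-channel cstep padding, so the raw pointer walk below stays valid)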
Mat bottom_blob_flatten = bottom_blob.reshape(w * h * in_c); Mat offset_flatten = offset.reshape(offset.w * offset.h * offset.c); Mat mask_flatten = mask.reshape(mask.w * mask.h * mask.c); const float* data_im_ptr = bottom_blob_flatten; const float* data_offset_ptr = offset_flatten; const float* data_mask_ptr = mask_flatten; - float* im2col_ptr = im2col; + const float* weight_ptr = weight_data; + const float* bias_ptr = weight_data; + if (bias_term) + bias_ptr = bias_data; + float* output_ptr = output; - // im2col + // deformable conv #pragma omp parallel for num_threads(opt.num_threads) for (int h_col = 0; h_col < out_h; h_col++) { @@ -125,144 +103,109 @@ int DeformableConv2D::forward(const std::vector& bottom_blobs, std::vector< { int h_in = h_col * stride_h - pad_top; int w_in = w_col * stride_w - pad_left; - float* data_col_ptr = im2col_ptr + (h_col * out_w + w_col) * kernel_h * kernel_w * in_c; - for (int i = 0; i < kernel_h; i++) + float* output_hw_ptr = output_ptr + (h_col * out_w + w_col); + for (int oc = 0; oc < num_output; oc++) { - for (int j = 0; j < kernel_w; j++) + float sum = 0.f; + if (bias_term) + sum = bias_ptr[oc]; + for (int i = 0; i < kernel_h; i++) { - const int data_offset_h_ptr = (((i * kernel_w + j) * 2) * out_h + h_col) * out_w + w_col; - const int data_offset_w_ptr = (((i * kernel_w + j) * 2 + 1) * out_h + h_col) * out_w + w_col; - const int data_mask_hw_ptr = ((i * kernel_w + j) * out_h + h_col) * out_w + w_col; - - const float offset_h = data_offset_ptr[data_offset_h_ptr]; - const float offset_w = data_offset_ptr[data_offset_w_ptr]; - const float mask_ = data_mask_ptr[data_mask_hw_ptr]; - const float h_im = h_in + i * dilation_h + offset_h; - const float w_im = w_in + j * dilation_w + offset_w; - - // Bilinear - const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; - float w1 = 0.f; - float w2 = 0.f; - float w3 = 0.f; - float w4 = 0.f; - bool v1_cond = false; - bool v2_cond = false; - bool v3_cond = false; - bool v4_cond = false; - int v1_pos = 0; - int v2_pos = 0; - int v3_pos = 0; - int v4_pos = 0; - if (cond) { - int h_low = floor(h_im); - int w_low = floor(w_im); - int h_high = h_low + 1; - int w_high = w_low + 1; - - float lh = h_im - h_low; - float lw = w_im - w_low; - float hh = 1 - lh; - float hw = 1 - lw; - - v1_cond = (h_low >= 0 && w_low >= 0); - v2_cond = (h_low >= 0 && w_high <= w - 1); - v3_cond = (h_high <= h - 1 && w_low >= 0); - v4_cond = (h_high <= h - 1 && w_high <= w - 1); - if (v1_cond) { - v1_pos = h_low * w + w_low; - } - if (v2_cond) { - v2_pos = h_low * w + w_high; - } - if (v3_cond) { - v3_pos = h_high * w + w_low; - } - if (v4_cond) { - v4_pos = h_high * w + w_high; - } - - w1 = hh * hw; - w2 = hh * lw; - w3 = lh * hw; - w4 = lh * lw; - } - - const float* data_im_channel_ptr = data_im_ptr; - for (int c_im = 0; c_im < in_c; c_im++) + for (int j = 0; j < kernel_w; j++) { - float val = 0.f; + const int data_offset_h_ptr = (((i * kernel_w + j) * 2) * out_h + h_col) * out_w + w_col; + const int data_offset_w_ptr = (((i * kernel_w + j) * 2 + 1) * out_h + h_col) * out_w + w_col; + const int data_mask_hw_ptr = ((i * kernel_w + j) * out_h + h_col) * out_w + w_col; + + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask_ = data_mask_ptr[data_mask_hw_ptr]; + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + 
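// w1..w4 are the bilinear corner weights for the 4 integer-grid neighbours of (h_im, w_im); out-of-map corners contribute 0
+                        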
float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; if (cond) { - float v1 = 0.f; + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); if (v1_cond) { - v1 = data_im_channel_ptr[v1_pos]; + v1_pos = h_low * w + w_low; } - float v2 = 0.f; if (v2_cond) { - v2 = data_im_channel_ptr[v2_pos]; + v2_pos = h_low * w + w_high; } - float v3 = 0.f; if (v3_cond) { - v3 = data_im_channel_ptr[v3_pos]; + v3_pos = h_high * w + w_low; } - float v4 = 0.f; if (v4_cond) { - v4 = data_im_channel_ptr[v4_pos]; + v4_pos = h_high * w + w_high; + } + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + const float* data_im_channel_ptr = data_im_ptr; + for (int c_im = 0; c_im < in_c; c_im++) + { + float val = 0.f; + if (cond) { + float v1 = 0.f; + if (v1_cond) { + v1 = data_im_channel_ptr[v1_pos]; + } + float v2 = 0.f; + if (v2_cond) { + v2 = data_im_channel_ptr[v2_pos]; + } + float v3 = 0.f; + if (v3_cond) { + v3 = data_im_channel_ptr[v3_pos]; + } + float v4 = 0.f; + if (v4_cond) { + v4 = data_im_channel_ptr[v4_pos]; + } + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; } - val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + sum += val * mask_ * weight_ptr[((oc * in_c + c_im) * kernel_h + i) * kernel_w + j]; + data_im_channel_ptr += h*w; } - *data_col_ptr = val * mask_; - data_col_ptr += 1; - data_im_channel_ptr += h*w; } } + *output_hw_ptr = activation_ss(sum, activation_type, activation_params); + output_hw_ptr += out_h * out_w; } } } - im2col = im2col.reshape(kernel_h * kernel_w * in_c, out_h * out_w); - - // call InnerProduct - ncnn::Layer* innerProduct = ncnn::create_layer(ncnn::LayerType::InnerProduct); - - // set param - ncnn::ParamDict pd; - pd.set(0, num_output); - pd.set(1, bias_term); - pd.set(2, weight_data_size); - pd.set(9, activation_type); - pd.set(10, activation_params); - innerProduct->load_param(pd); - - // set weights - ncnn::Mat weights[2]; - weights[0] = weight_data_t; - if (bias_term) - { - weights[1] = bias_data; - } - innerProduct->load_model(ncnn::ModelBinFromMatArray(weights)); - innerProduct->create_pipeline(opt); - - // forward - innerProduct->forward(im2col, output, opt); - innerProduct->destroy_pipeline(opt); - delete innerProduct; - - ncnn::Mat output_t; - // call Permute - ncnn::Layer* permute = ncnn::create_layer(ncnn::LayerType::Permute); - - // set param - ncnn::ParamDict permute_pd; - permute_pd.set(0, 1); - permute->load_param(permute_pd); - permute->create_pipeline(opt); - // forward - permute->forward(output, output_t, opt); - permute->destroy_pipeline(opt); - delete permute; - output_t = output_t.reshape(out_w, out_h, num_output); - top_blobs[0] = output_t; + output = output.reshape(out_w, out_h, num_output); + top_blobs[0] = output; return 0; } diff --git a/src/layer/deformableconv2d.h b/src/layer/deformableconv2d.h index 8e3b5c247ba..d7315fc3aab 100644 --- a/src/layer/deformableconv2d.h +++ b/src/layer/deformableconv2d.h @@ -54,8 +54,6 @@ class DeformableConv2D : public Layer // model Mat weight_data; Mat bias_data; - - Mat weight_data_t; }; } // 
namespace ncnn diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp new file mode 100644 index 00000000000..5e91a45b19e --- /dev/null +++ b/src/layer/x86/deformableconv2d_x86.cpp @@ -0,0 +1,249 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "deformableconv2d_x86.h" + +#include "layer_type.h" + +namespace ncnn { + +DeformableConv2D_x86::DeformableConv2D_x86() +{ + one_blob_only = false; + support_inplace = false; +} + +int DeformableConv2D_x86::load_model(const ModelBin& mb) +{ + weight_data = mb.load(weight_data_size, 0); + if (weight_data.empty()) + return -100; + + if (bias_term) + { + bias_data = mb.load(num_output, 1); + if (bias_data.empty()) + return -100; + } + + const int in_c = weight_data_size / (num_output * kernel_h * kernel_w); + weight_data = weight_data.reshape(kernel_w * kernel_h, in_c, num_output); + weight_data_t.create(in_c, kernel_w * kernel_h, num_output); + if (weight_data_t.empty()) + return -100; + for (int q = 0; q < num_output; q++) + { + const Mat m = weight_data.channel(q); + float* outptr = weight_data_t.channel(q); + + for (int i = 0; i < kernel_w * kernel_h; i++) + { + for (int j = 0; j < in_c; j++) + { + *outptr++ = m.row(j)[i]; + } + } + } + weight_data_t = weight_data_t.reshape(in_c * kernel_w * kernel_h, num_output); + return 0; +} + +int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const Mat& mask = bottom_blobs[2]; + + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int in_c = bottom_blob.c; + const size_t elemsize = bottom_blob.elemsize; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int out_w = (w + pad_left + pad_right - kernel_extent_w) / stride_w + 1; + const int out_h = (h + pad_top + pad_bottom - kernel_extent_h) / stride_h + 1; + + // output = im2col matmul weight_t, im2col.shape is [out_h * out_w, kernel_h * kernel_w * in_c] (in python), + // weight_t.shape is [num_output, kernel_h * kernel_w * in_c] (in python), + // output.shape is [out_h * out_w, num_output] (in python). 
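+    // shape walk-through with illustrative numbers (not taken from the code): in_c=4, kernel=3x3, out=5x6
+    // makes im2col [30, 36] and weight_t [num_output, 36]; InnerProduct yields [30, num_output],
+    // Permute transposes to [num_output, 30], and the final reshape gives [num_output, 5, 6].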
+ Mat im2col; + im2col.create(kernel_h * kernel_w * in_c * out_h * out_w, elemsize, opt.blob_allocator); + if (im2col.empty()) + return -100; + + Mat& output = top_blobs[0]; + output.create(num_output, out_h * out_w, elemsize, opt.blob_allocator); + if (output.empty()) + return -100; + + Mat bottom_blob_flatten = bottom_blob.reshape(w * h * in_c); + Mat offset_flatten = offset.reshape(offset.w * offset.h * offset.c); + Mat mask_flatten = mask.reshape(mask.w * mask.h * mask.c); + const float* data_im_ptr = bottom_blob_flatten; + const float* data_offset_ptr = offset_flatten; + const float* data_mask_ptr = mask_flatten; + float* im2col_ptr = im2col; + + // im2col + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < out_h; h_col++) + { + for (int w_col = 0; w_col < out_w; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + float* data_col_ptr = im2col_ptr + (h_col * out_w + w_col) * kernel_h * kernel_w * in_c; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + const int data_offset_h_ptr = (((i * kernel_w + j) * 2) * out_h + h_col) * out_w + w_col; + const int data_offset_w_ptr = (((i * kernel_w + j) * 2 + 1) * out_h + h_col) * out_w + w_col; + const int data_mask_hw_ptr = ((i * kernel_w + j) * out_h + h_col) * out_w + w_col; + + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask_ = data_mask_ptr[data_mask_hw_ptr]; + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) { + v1_pos = h_low * w + w_low; + } + if (v2_cond) { + v2_pos = h_low * w + w_high; + } + if (v3_cond) { + v3_pos = h_high * w + w_low; + } + if (v4_cond) { + v4_pos = h_high * w + w_high; + } + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + const float* data_im_channel_ptr = data_im_ptr; + for (int c_im = 0; c_im < in_c; c_im++) + { + float val = 0.f; + if (cond) { + float v1 = 0.f; + if (v1_cond) { + v1 = data_im_channel_ptr[v1_pos]; + } + float v2 = 0.f; + if (v2_cond) { + v2 = data_im_channel_ptr[v2_pos]; + } + float v3 = 0.f; + if (v3_cond) { + v3 = data_im_channel_ptr[v3_pos]; + } + float v4 = 0.f; + if (v4_cond) { + v4 = data_im_channel_ptr[v4_pos]; + } + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + *data_col_ptr = val * mask_; + data_col_ptr += 1; + data_im_channel_ptr += h*w; + } + } + } + } + } + im2col = im2col.reshape(kernel_h * kernel_w * in_c, out_h * out_w); + + // call InnerProduct + ncnn::Layer* innerProduct = ncnn::create_layer(ncnn::LayerType::InnerProduct); + + // set param + ncnn::ParamDict pd; + pd.set(0, num_output); + pd.set(1, bias_term); + pd.set(2, weight_data_size); + pd.set(9, activation_type); 
+ pd.set(10, activation_params); + innerProduct->load_param(pd); + + // set weights + ncnn::Mat weights[2]; + weights[0] = weight_data_t; + if (bias_term) + { + weights[1] = bias_data; + } + innerProduct->load_model(ncnn::ModelBinFromMatArray(weights)); + innerProduct->create_pipeline(opt); + + // forward + innerProduct->forward(im2col, output, opt); + innerProduct->destroy_pipeline(opt); + delete innerProduct; + + ncnn::Mat output_t; + // call Permute + ncnn::Layer* permute = ncnn::create_layer(ncnn::LayerType::Permute); + + // set param + ncnn::ParamDict permute_pd; + permute_pd.set(0, 1); + permute->load_param(permute_pd); + permute->create_pipeline(opt); + // forward + permute->forward(output, output_t, opt); + permute->destroy_pipeline(opt); + delete permute; + output_t = output_t.reshape(out_w, out_h, num_output); + top_blobs[0] = output_t; + return 0; +} + +} // namespace ncnn diff --git a/src/layer/x86/deformableconv2d_x86.h b/src/layer/x86/deformableconv2d_x86.h new file mode 100644 index 00000000000..1510207f7e6 --- /dev/null +++ b/src/layer/x86/deformableconv2d_x86.h @@ -0,0 +1,37 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
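+// x86 specialization: lowers the deformable sampling into an im2col buffer and delegates the
+// matmul to the optimized InnerProduct layer (plus a Permute), rather than the naive per-pixel loop.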
+ +#ifndef LAYER_DEFORMABLECONV2D_X86_H +#define LAYER_DEFORMABLECONV2D_X86_H + +#include "deformableconv2d.h" + +namespace ncnn { + +class DeformableConv2D_x86 : virtual public DeformableConv2D +{ +public: + DeformableConv2D_x86(); + + virtual int load_model(const ModelBin& mb); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +public: + Mat weight_data_t; +}; + +} // namespace ncnn + +#endif // LAYER_DEFORMABLECONV2D_X86_H From 4c8fdc41b5b5450a684e7e32c0a4e3fb386c31cf Mon Sep 17 00:00:00 2001 From: miemie2013 <53960695+miemie2013@users.noreply.github.com> Date: Fri, 22 Jul 2022 18:10:51 +0800 Subject: [PATCH 03/18] Add DeformableConv2D Add DeformableConv2D --- src/CMakeLists.txt | 2 +- src/layer/deformableconv2d.cpp | 72 +++++++-------------- src/layer/x86/deformableconv2d_x86.cpp | 87 +++++++++++++++----------- src/layer/x86/deformableconv2d_x86.h | 6 ++ 4 files changed, 78 insertions(+), 89 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 19675780cb4..3b1f8ddca59 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -71,7 +71,6 @@ ncnn_add_layer(Concat) ncnn_add_layer(Convolution) ncnn_add_layer(Crop) ncnn_add_layer(Deconvolution) -ncnn_add_layer(DeformableConv2D) ncnn_add_layer(Dropout) ncnn_add_layer(Eltwise) ncnn_add_layer(ELU) @@ -156,6 +155,7 @@ ncnn_add_layer(DeconvolutionDepthWise1D) ncnn_add_layer(Deconvolution3D) ncnn_add_layer(DeconvolutionDepthWise3D) ncnn_add_layer(Einsum) +ncnn_add_layer(DeformableConv2D) if(NCNN_VULKAN) ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp) diff --git a/src/layer/deformableconv2d.cpp b/src/layer/deformableconv2d.cpp index c35134b9a88..f41d0d2f145 100644 --- a/src/layer/deformableconv2d.cpp +++ b/src/layer/deformableconv2d.cpp @@ -78,22 +78,14 @@ int DeformableConv2D::forward(const std::vector& bottom_blobs, std::vector< // output.shape is [num_output, out_h, out_w] (in python). Mat& output = top_blobs[0]; - output.create(out_w * out_h * num_output, elemsize, opt.blob_allocator); + output.create(out_w, out_h, num_output, elemsize, opt.blob_allocator); if (output.empty()) return -100; - // bottom_blob.shape is [in_c, h, w] (in python). 
- Mat bottom_blob_flatten = bottom_blob.reshape(w * h * in_c); - Mat offset_flatten = offset.reshape(offset.w * offset.h * offset.c); - Mat mask_flatten = mask.reshape(mask.w * mask.h * mask.c); - const float* data_im_ptr = bottom_blob_flatten; - const float* data_offset_ptr = offset_flatten; - const float* data_mask_ptr = mask_flatten; const float* weight_ptr = weight_data; const float* bias_ptr = weight_data; if (bias_term) bias_ptr = bias_data; - float* output_ptr = output; // deformable conv #pragma omp parallel for num_threads(opt.num_threads) @@ -103,7 +95,6 @@ int DeformableConv2D::forward(const std::vector& bottom_blobs, std::vector< { int h_in = h_col * stride_h - pad_top; int w_in = w_col * stride_w - pad_left; - float* output_hw_ptr = output_ptr + (h_col * out_w + w_col); for (int oc = 0; oc < num_output; oc++) { float sum = 0.f; @@ -113,18 +104,18 @@ int DeformableConv2D::forward(const std::vector& bottom_blobs, std::vector< { for (int j = 0; j < kernel_w; j++) { - const int data_offset_h_ptr = (((i * kernel_w + j) * 2) * out_h + h_col) * out_w + w_col; - const int data_offset_w_ptr = (((i * kernel_w + j) * 2 + 1) * out_h + h_col) * out_w + w_col; - const int data_mask_hw_ptr = ((i * kernel_w + j) * out_h + h_col) * out_w + w_col; - - const float offset_h = data_offset_ptr[data_offset_h_ptr]; - const float offset_w = data_offset_ptr[data_offset_w_ptr]; - const float mask_ = data_mask_ptr[data_mask_hw_ptr]; + const float offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + const float offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + const float mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; const float h_im = h_in + i * dilation_h + offset_h; const float w_im = w_in + j * dilation_w + offset_w; // Bilinear const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; float w1 = 0.f; float w2 = 0.f; float w3 = 0.f; @@ -138,10 +129,10 @@ int DeformableConv2D::forward(const std::vector& bottom_blobs, std::vector< int v3_pos = 0; int v4_pos = 0; if (cond) { - int h_low = floor(h_im); - int w_low = floor(w_im); - int h_high = h_low + 1; - int w_high = w_low + 1; + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; float lh = h_im - h_low; float lw = w_im - w_low; @@ -152,18 +143,6 @@ int DeformableConv2D::forward(const std::vector& bottom_blobs, std::vector< v2_cond = (h_low >= 0 && w_high <= w - 1); v3_cond = (h_high <= h - 1 && w_low >= 0); v4_cond = (h_high <= h - 1 && w_high <= w - 1); - if (v1_cond) { - v1_pos = h_low * w + w_low; - } - if (v2_cond) { - v2_pos = h_low * w + w_high; - } - if (v3_cond) { - v3_pos = h_high * w + w_low; - } - if (v4_cond) { - v4_pos = h_high * w + w_high; - } w1 = hh * hw; w2 = hh * lw; @@ -171,41 +150,32 @@ int DeformableConv2D::forward(const std::vector& bottom_blobs, std::vector< w4 = lh * lw; } - const float* data_im_channel_ptr = data_im_ptr; for (int c_im = 0; c_im < in_c; c_im++) { float val = 0.f; if (cond) { float v1 = 0.f; - if (v1_cond) { - v1 = data_im_channel_ptr[v1_pos]; - } + if (v1_cond) + v1 = bottom_blob.channel(c_im).row(h_low)[w_low]; float v2 = 0.f; - if (v2_cond) { - v2 = data_im_channel_ptr[v2_pos]; - } + if (v2_cond) + v2 = bottom_blob.channel(c_im).row(h_low)[w_high]; float v3 = 0.f; - if (v3_cond) { - v3 = data_im_channel_ptr[v3_pos]; - } + if (v3_cond) + v3 = bottom_blob.channel(c_im).row(h_high)[w_low]; float v4 = 0.f; - if (v4_cond) { - v4 = 
data_im_channel_ptr[v4_pos]; - } + if (v4_cond) + v4 = bottom_blob.channel(c_im).row(h_high)[w_high]; val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; } sum += val * mask_ * weight_ptr[((oc * in_c + c_im) * kernel_h + i) * kernel_w + j]; - data_im_channel_ptr += h*w; } } } - *output_hw_ptr = activation_ss(sum, activation_type, activation_params); - output_hw_ptr += out_h * out_w; + output.channel(oc).row(h_col)[w_col] = activation_ss(sum, activation_type, activation_params); } } } - output = output.reshape(out_w, out_h, num_output); - top_blobs[0] = output; return 0; } diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp index 5e91a45b19e..036e0a40272 100644 --- a/src/layer/x86/deformableconv2d_x86.cpp +++ b/src/layer/x86/deformableconv2d_x86.cpp @@ -22,6 +22,9 @@ DeformableConv2D_x86::DeformableConv2D_x86() { one_blob_only = false; support_inplace = false; + + inner_product = 0; + permute = 0; } int DeformableConv2D_x86::load_model(const ModelBin& mb) @@ -59,6 +62,52 @@ int DeformableConv2D_x86::load_model(const ModelBin& mb) return 0; } +int DeformableConv2D_x86::create_pipeline(const Option& opt) +{ + { + inner_product = ncnn::create_layer(ncnn::LayerType::InnerProduct); + ncnn::ParamDict pd; + pd.set(0, num_output); + pd.set(1, bias_term); + pd.set(2, weight_data_size); + pd.set(9, activation_type); + pd.set(10, activation_params); + inner_product->load_param(pd); + ncnn::Mat weights[2]; + weights[0] = weight_data_t; + if (bias_term) + weights[1] = bias_data; + inner_product->load_model(ncnn::ModelBinFromMatArray(weights)); + inner_product->create_pipeline(opt); + + permute = ncnn::create_layer(ncnn::LayerType::Permute); + ncnn::ParamDict permute_pd; + permute_pd.set(0, 1); + permute->load_param(permute_pd); + permute->create_pipeline(opt); + } + + return 0; +} + +int DeformableConv2D_x86::destroy_pipeline(const Option& opt) +{ + if (inner_product) + { + inner_product->destroy_pipeline(opt); + delete inner_product; + inner_product = 0; + } + if (permute) + { + permute->destroy_pipeline(opt); + delete permute; + permute = 0; + } + + return 0; +} + int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& bottom_blob = bottom_blobs[0]; @@ -200,47 +249,11 @@ int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vec } } im2col = im2col.reshape(kernel_h * kernel_w * in_c, out_h * out_w); - // call InnerProduct - ncnn::Layer* innerProduct = ncnn::create_layer(ncnn::LayerType::InnerProduct); - - // set param - ncnn::ParamDict pd; - pd.set(0, num_output); - pd.set(1, bias_term); - pd.set(2, weight_data_size); - pd.set(9, activation_type); - pd.set(10, activation_params); - innerProduct->load_param(pd); - - // set weights - ncnn::Mat weights[2]; - weights[0] = weight_data_t; - if (bias_term) - { - weights[1] = bias_data; - } - innerProduct->load_model(ncnn::ModelBinFromMatArray(weights)); - innerProduct->create_pipeline(opt); - - // forward - innerProduct->forward(im2col, output, opt); - innerProduct->destroy_pipeline(opt); - delete innerProduct; - + inner_product->forward(im2col, output, opt); ncnn::Mat output_t; // call Permute - ncnn::Layer* permute = ncnn::create_layer(ncnn::LayerType::Permute); - - // set param - ncnn::ParamDict permute_pd; - permute_pd.set(0, 1); - permute->load_param(permute_pd); - permute->create_pipeline(opt); - // forward permute->forward(output, output_t, opt); - permute->destroy_pipeline(opt); - delete permute; output_t = 
output_t.reshape(out_w, out_h, num_output); top_blobs[0] = output_t; return 0; diff --git a/src/layer/x86/deformableconv2d_x86.h b/src/layer/x86/deformableconv2d_x86.h index 1510207f7e6..50dabdbeaaf 100644 --- a/src/layer/x86/deformableconv2d_x86.h +++ b/src/layer/x86/deformableconv2d_x86.h @@ -26,10 +26,16 @@ class DeformableConv2D_x86 : virtual public DeformableConv2D virtual int load_model(const ModelBin& mb); + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: Mat weight_data_t; + + Layer* inner_product; + Layer* permute; }; } // namespace ncnn From e6d965f33c1423d49af9a54724fd3bb468c9b28f Mon Sep 17 00:00:00 2001 From: miemie2013 <53960695+miemie2013@users.noreply.github.com> Date: Fri, 22 Jul 2022 18:22:29 +0800 Subject: [PATCH 04/18] Add DeformableConv2D Add DeformableConv2D --- src/layer/deformableconv2d.cpp | 4 ---- src/layer/x86/deformableconv2d_x86.cpp | 24 ++++++++---------------- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/src/layer/deformableconv2d.cpp b/src/layer/deformableconv2d.cpp index f41d0d2f145..46a674c6051 100644 --- a/src/layer/deformableconv2d.cpp +++ b/src/layer/deformableconv2d.cpp @@ -124,10 +124,6 @@ int DeformableConv2D::forward(const std::vector& bottom_blobs, std::vector< bool v2_cond = false; bool v3_cond = false; bool v4_cond = false; - int v1_pos = 0; - int v2_pos = 0; - int v3_pos = 0; - int v4_pos = 0; if (cond) { h_low = floor(h_im); w_low = floor(w_im); diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp index 036e0a40272..3efa3b82cea 100644 --- a/src/layer/x86/deformableconv2d_x86.cpp +++ b/src/layer/x86/deformableconv2d_x86.cpp @@ -198,18 +198,14 @@ int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vec v2_cond = (h_low >= 0 && w_high <= w - 1); v3_cond = (h_high <= h - 1 && w_low >= 0); v4_cond = (h_high <= h - 1 && w_high <= w - 1); - if (v1_cond) { + if (v1_cond) v1_pos = h_low * w + w_low; - } - if (v2_cond) { + if (v2_cond) v2_pos = h_low * w + w_high; - } - if (v3_cond) { + if (v3_cond) v3_pos = h_high * w + w_low; - } - if (v4_cond) { + if (v4_cond) v4_pos = h_high * w + w_high; - } w1 = hh * hw; w2 = hh * lw; @@ -223,21 +219,17 @@ int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vec float val = 0.f; if (cond) { float v1 = 0.f; - if (v1_cond) { + if (v1_cond) v1 = data_im_channel_ptr[v1_pos]; - } float v2 = 0.f; - if (v2_cond) { + if (v2_cond) v2 = data_im_channel_ptr[v2_pos]; - } float v3 = 0.f; - if (v3_cond) { + if (v3_cond) v3 = data_im_channel_ptr[v3_pos]; - } float v4 = 0.f; - if (v4_cond) { + if (v4_cond) v4 = data_im_channel_ptr[v4_pos]; - } val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; } *data_col_ptr = val * mask_; From 06defc5b2dbc28441cae0813688b644222a2e67b Mon Sep 17 00:00:00 2001 From: miemie2013 <53960695+miemie2013@users.noreply.github.com> Date: Mon, 25 Jul 2022 16:00:39 +0800 Subject: [PATCH 05/18] add unittest and docs --- docs/developer-guide/operators.md | 30 ++++++++ tests/CMakeLists.txt | 1 + tests/test_deformableconv2d.cpp | 116 ++++++++++++++++++++++++++++++ 3 files changed, 147 insertions(+) create mode 100644 tests/test_deformableconv2d.cpp diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md index e95a37c0c4f..dbef4a62694 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -76,6 +76,7 @@ 
* [Threshold](#threshold) * [Tile](#tile) * [UnaryOp](#unaryop) +* [DeformableConv2D](#deformableconv2d) # AbsVal ``` @@ -1641,3 +1642,32 @@ Operation type: - 14 = ATAN - 15 = RECIPROCAL - 16 = TANH + +# DeformableConv2D +``` +x2 = deformableconv2d(x, offset, mask, weight, kernel, stride, dilation) + bias +y = activation(x2, act_type, act_params) +``` + +| param id | name | type | default | description | +| --------- | ------------- | ----- | --------- | ----------------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size| int | 0 | | +| 9 | activation_type| int | 0 | | +| 10 | activation_params| array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | + +| weight | type | shape | +| ------------- | ----- | --------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | +| bias_data | float | [num_output] | diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ee54d3a7cbf..494366b2bf7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -134,3 +134,4 @@ ncnn_add_layer_test(TanH) ncnn_add_layer_test(Tile) ncnn_add_layer_test(UnaryOp) ncnn_add_layer_test(Yolov3DetectionOutput) +ncnn_add_layer_test(DeformableConv2D) diff --git a/tests/test_deformableconv2d.cpp b/tests/test_deformableconv2d.cpp new file mode 100644 index 00000000000..611ec427042 --- /dev/null +++ b/tests/test_deformableconv2d.cpp @@ -0,0 +1,116 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/deformableconv2d.h" +#include "testutil.h" + +static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) +{ + const int kernel_extent_w = dilation * (kernel - 1) + 1; + const int kernel_extent_h = dilation * (kernel - 1) + 1; + const int out_w = (w + pad + pad - kernel_extent_w) / stride + 1; + const int out_h = (h + pad + pad - kernel_extent_h) / stride + 1; + std::vector a(3); + a[0] = RandomMat(w, h, c); + a[1] = RandomMat(out_w, out_h, kernel * kernel * 2); + a[2] = RandomMat(out_w, out_h, kernel * kernel); + + ncnn::ParamDict pd; + pd.set(0, outch); + pd.set(1, kernel); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, outch * c * kernel * kernel); + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? 
RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 2 : 1); + weights[0] = RandomMat(outch * c * kernel * kernel); + if (bias) + weights[1] = RandomMat(outch); + + float epsilon = 0.001; + int ret = test_layer("DeformableConv2D", pd, weights, a, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_deformableconv2d_0() +{ + static const int kdsp[16][4] = { + {1, 1, 1, 0}, + {1, 1, 2, 0}, + {2, 1, 1, 1}, + {2, 1, 2, -233}, + {3, 1, 1, 1}, + {3, 1, 2, 1}, + {3, 2, 1, 1}, + {4, 1, 1, -233}, + {4, 1, 2, -234}, + {4, 2, 1, -234}, + {5, 1, 1, 2}, + {5, 1, 2, 2}, + {5, 2, 2, 2}, + {7, 1, 1, 3}, + {7, 1, 2, 3}, + {7, 2, 1, -233}, + }; + + for (int i = 0; i < 16; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_deformableconv2d(9, 7, 1, 1, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 4, 13, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 13, 4, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 4, 8, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 8, 4, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 8, 13, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 13, 8, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 16, 16, k, d, s, p, 0); + + if (ret != 0) + return -1; + } + + return 0 + || test_deformableconv2d(7, 5, 24, 32, 4, 2, 2, 2, 1) + || test_deformableconv2d(7, 5, 32, 24, 4, 2, 2, 2, 1) + || test_deformableconv2d(7, 5, 28, 32, 4, 2, 2, 2, 1) + || test_deformableconv2d(7, 5, 32, 28, 4, 2, 2, 2, 1) + || test_deformableconv2d(7, 5, 26, 32, 4, 2, 2, 2, 1) + || test_deformableconv2d(7, 5, 32, 26, 4, 2, 2, 2, 1); +} + +int main() +{ + SRAND(7767517); + + return test_deformableconv2d_0(); +} From 01c51df39b787c280b7387118cf6e5701b2eba61 Mon Sep 17 00:00:00 2001 From: miemie2013 <53960695+miemie2013@users.noreply.github.com> Date: Mon, 25 Jul 2022 16:10:49 +0800 Subject: [PATCH 06/18] fix pad negative value. 
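The pad values -233/-234 are the SAME_UPPER/SAME_LOWER sentinels, which DeformableConv2D's forward path does not implement, so the test matrix now sticks to explicit non-negative padding.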
--- tests/test_deformableconv2d.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_deformableconv2d.cpp b/tests/test_deformableconv2d.cpp index 611ec427042..9b4cb256e55 100644 --- a/tests/test_deformableconv2d.cpp +++ b/tests/test_deformableconv2d.cpp @@ -63,19 +63,19 @@ static int test_deformableconv2d_0() {1, 1, 1, 0}, {1, 1, 2, 0}, {2, 1, 1, 1}, - {2, 1, 2, -233}, + {2, 1, 2, 0}, {3, 1, 1, 1}, {3, 1, 2, 1}, {3, 2, 1, 1}, - {4, 1, 1, -233}, - {4, 1, 2, -234}, - {4, 2, 1, -234}, + {4, 1, 1, 0}, + {4, 1, 2, 1}, + {4, 2, 1, 1}, {5, 1, 1, 2}, {5, 1, 2, 2}, {5, 2, 2, 2}, {7, 1, 1, 3}, {7, 1, 2, 3}, - {7, 2, 1, -233}, + {7, 2, 1, 3}, }; for (int i = 0; i < 16; i++) From 0458b59d845735685b29516d84bf8295988ee2f2 Mon Sep 17 00:00:00 2001 From: miemie2013 Date: Mon, 25 Jul 2022 08:41:52 +0000 Subject: [PATCH 07/18] apply code-format changes --- src/layer/deformableconv2d.cpp | 6 ++++-- src/layer/x86/deformableconv2d_x86.cpp | 8 +++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/layer/deformableconv2d.cpp b/src/layer/deformableconv2d.cpp index 46a674c6051..fea22f77697 100644 --- a/src/layer/deformableconv2d.cpp +++ b/src/layer/deformableconv2d.cpp @@ -124,7 +124,8 @@ int DeformableConv2D::forward(const std::vector& bottom_blobs, std::vector< bool v2_cond = false; bool v3_cond = false; bool v4_cond = false; - if (cond) { + if (cond) + { h_low = floor(h_im); w_low = floor(w_im); h_high = h_low + 1; @@ -149,7 +150,8 @@ int DeformableConv2D::forward(const std::vector& bottom_blobs, std::vector< for (int c_im = 0; c_im < in_c; c_im++) { float val = 0.f; - if (cond) { + if (cond) + { float v1 = 0.f; if (v1_cond) v1 = bottom_blob.channel(c_im).row(h_low)[w_low]; diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp index 3efa3b82cea..6d89296a295 100644 --- a/src/layer/x86/deformableconv2d_x86.cpp +++ b/src/layer/x86/deformableconv2d_x86.cpp @@ -183,7 +183,8 @@ int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vec int v2_pos = 0; int v3_pos = 0; int v4_pos = 0; - if (cond) { + if (cond) + { int h_low = floor(h_im); int w_low = floor(w_im); int h_high = h_low + 1; @@ -217,7 +218,8 @@ int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vec for (int c_im = 0; c_im < in_c; c_im++) { float val = 0.f; - if (cond) { + if (cond) + { float v1 = 0.f; if (v1_cond) v1 = data_im_channel_ptr[v1_pos]; @@ -234,7 +236,7 @@ int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vec } *data_col_ptr = val * mask_; data_col_ptr += 1; - data_im_channel_ptr += h*w; + data_im_channel_ptr += h * w; } } } From 5a8bac7938331ea82a78c489095ee9583f69ddc3 Mon Sep 17 00:00:00 2001 From: miemie2013 <53960695+miemie2013@users.noreply.github.com> Date: Tue, 26 Jul 2022 14:19:05 +0800 Subject: [PATCH 08/18] fix ptr bug --- src/layer/deformableconv2d.cpp | 21 +++++++-------------- src/layer/x86/deformableconv2d_x86.cpp | 16 ++++------------ tests/test_deformableconv2d.cpp | 2 +- 3 files changed, 12 insertions(+), 27 deletions(-) diff --git a/src/layer/deformableconv2d.cpp b/src/layer/deformableconv2d.cpp index fea22f77697..3ca0c68acf3 100644 --- a/src/layer/deformableconv2d.cpp +++ b/src/layer/deformableconv2d.cpp @@ -82,8 +82,9 @@ int DeformableConv2D::forward(const std::vector& bottom_blobs, std::vector< if (output.empty()) return -100; - const float* weight_ptr = weight_data; - const float* bias_ptr = weight_data; + Mat weight_flatten = weight_data.reshape(weight_data_size); + 
const float* weight_ptr = weight_flatten; + const float* bias_ptr = weight_flatten; if (bias_term) bias_ptr = bias_data; @@ -152,18 +153,10 @@ int DeformableConv2D::forward(const std::vector& bottom_blobs, std::vector< float val = 0.f; if (cond) { - float v1 = 0.f; - if (v1_cond) - v1 = bottom_blob.channel(c_im).row(h_low)[w_low]; - float v2 = 0.f; - if (v2_cond) - v2 = bottom_blob.channel(c_im).row(h_low)[w_high]; - float v3 = 0.f; - if (v3_cond) - v3 = bottom_blob.channel(c_im).row(h_high)[w_low]; - float v4 = 0.f; - if (v4_cond) - v4 = bottom_blob.channel(c_im).row(h_high)[w_high]; + float v1 = v1_cond ? bottom_blob.channel(c_im).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(c_im).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(c_im).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(c_im).row(h_high)[w_high] : 0.f; val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; } sum += val * mask_ * weight_ptr[((oc * in_c + c_im) * kernel_h + i) * kernel_w + j]; diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp index 6d89296a295..e41265f8b1d 100644 --- a/src/layer/x86/deformableconv2d_x86.cpp +++ b/src/layer/x86/deformableconv2d_x86.cpp @@ -220,18 +220,10 @@ int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vec float val = 0.f; if (cond) { - float v1 = 0.f; - if (v1_cond) - v1 = data_im_channel_ptr[v1_pos]; - float v2 = 0.f; - if (v2_cond) - v2 = data_im_channel_ptr[v2_pos]; - float v3 = 0.f; - if (v3_cond) - v3 = data_im_channel_ptr[v3_pos]; - float v4 = 0.f; - if (v4_cond) - v4 = data_im_channel_ptr[v4_pos]; + float v1 = v1_cond ? data_im_channel_ptr[v1_pos] : 0.f; + float v2 = v2_cond ? data_im_channel_ptr[v2_pos] : 0.f; + float v3 = v3_cond ? data_im_channel_ptr[v3_pos] : 0.f; + float v4 = v4_cond ? 
data_im_channel_ptr[v4_pos] : 0.f; val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; } *data_col_ptr = val * mask_; diff --git a/tests/test_deformableconv2d.cpp b/tests/test_deformableconv2d.cpp index 9b4cb256e55..01511e54496 100644 --- a/tests/test_deformableconv2d.cpp +++ b/tests/test_deformableconv2d.cpp @@ -48,7 +48,7 @@ static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int weights[1] = RandomMat(outch); float epsilon = 0.001; - int ret = test_layer("DeformableConv2D", pd, weights, a, epsilon); + int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); if (ret != 0) { fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); From d1463f1ac73c74ff8fd80957e21c7f11ff057d5f Mon Sep 17 00:00:00 2001 From: miemie2013 <53960695+miemie2013@users.noreply.github.com> Date: Tue, 26 Jul 2022 14:49:54 +0800 Subject: [PATCH 09/18] transform weight data in derived class create_pipeline() --- src/layer/x86/deformableconv2d_x86.cpp | 45 +++++++++----------------- src/layer/x86/deformableconv2d_x86.h | 2 -- 2 files changed, 15 insertions(+), 32 deletions(-) diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp index e41265f8b1d..54668e3d293 100644 --- a/src/layer/x86/deformableconv2d_x86.cpp +++ b/src/layer/x86/deformableconv2d_x86.cpp @@ -27,44 +27,29 @@ DeformableConv2D_x86::DeformableConv2D_x86() permute = 0; } -int DeformableConv2D_x86::load_model(const ModelBin& mb) +int DeformableConv2D_x86::create_pipeline(const Option& opt) { - weight_data = mb.load(weight_data_size, 0); - if (weight_data.empty()) - return -100; - - if (bias_term) - { - bias_data = mb.load(num_output, 1); - if (bias_data.empty()) - return -100; - } - const int in_c = weight_data_size / (num_output * kernel_h * kernel_w); - weight_data = weight_data.reshape(kernel_w * kernel_h, in_c, num_output); - weight_data_t.create(in_c, kernel_w * kernel_h, num_output); - if (weight_data_t.empty()) - return -100; - for (int q = 0; q < num_output; q++) { - const Mat m = weight_data.channel(q); - float* outptr = weight_data_t.channel(q); - - for (int i = 0; i < kernel_w * kernel_h; i++) + weight_data = weight_data.reshape(kernel_w * kernel_h, in_c, num_output); + weight_data_t.create(in_c, kernel_w * kernel_h, num_output); + if (weight_data_t.empty()) + return -100; + for (int q = 0; q < num_output; q++) { - for (int j = 0; j < in_c; j++) + const Mat m = weight_data.channel(q); + float* outptr = weight_data_t.channel(q); + + for (int i = 0; i < kernel_w * kernel_h; i++) { - *outptr++ = m.row(j)[i]; + for (int j = 0; j < in_c; j++) + { + *outptr++ = m.row(j)[i]; + } } } - } - weight_data_t = weight_data_t.reshape(in_c * kernel_w * kernel_h, num_output); - return 0; -} + weight_data_t = weight_data_t.reshape(in_c * kernel_w * kernel_h, num_output); -int DeformableConv2D_x86::create_pipeline(const Option& opt) -{ - { inner_product = ncnn::create_layer(ncnn::LayerType::InnerProduct); ncnn::ParamDict pd; pd.set(0, num_output); diff --git a/src/layer/x86/deformableconv2d_x86.h b/src/layer/x86/deformableconv2d_x86.h index 50dabdbeaaf..0e21c9392af 100644 --- a/src/layer/x86/deformableconv2d_x86.h +++ b/src/layer/x86/deformableconv2d_x86.h @@ -24,8 +24,6 @@ class DeformableConv2D_x86 : virtual public DeformableConv2D public: DeformableConv2D_x86(); - virtual int load_model(const 
ModelBin& mb); - virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); From e8fb93d73155ade86cd6ca54bdb7f4f1330c12e0 Mon Sep 17 00:00:00 2001 From: miemie2013 <53960695+miemie2013@users.noreply.github.com> Date: Tue, 26 Jul 2022 17:17:34 +0800 Subject: [PATCH 10/18] Add DeformableConv2D --- src/layer/deformableconv2d.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/layer/deformableconv2d.cpp b/src/layer/deformableconv2d.cpp index 3ca0c68acf3..c933dd1bddb 100644 --- a/src/layer/deformableconv2d.cpp +++ b/src/layer/deformableconv2d.cpp @@ -82,9 +82,8 @@ int DeformableConv2D::forward(const std::vector& bottom_blobs, std::vector< if (output.empty()) return -100; - Mat weight_flatten = weight_data.reshape(weight_data_size); - const float* weight_ptr = weight_flatten; - const float* bias_ptr = weight_flatten; + const float* weight_ptr = weight_data; + const float* bias_ptr = weight_data; if (bias_term) bias_ptr = bias_data; From 53ea2862756ffaee3f47d8cbbb2b8f3b3cd90b77 Mon Sep 17 00:00:00 2001 From: miemie2013 <53960695+miemie2013@users.noreply.github.com> Date: Wed, 27 Jul 2022 11:13:02 +0800 Subject: [PATCH 11/18] do not modify weight_data in x86 impl. --- src/layer/x86/deformableconv2d_x86.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp index 54668e3d293..04b4593141f 100644 --- a/src/layer/x86/deformableconv2d_x86.cpp +++ b/src/layer/x86/deformableconv2d_x86.cpp @@ -31,13 +31,13 @@ int DeformableConv2D_x86::create_pipeline(const Option& opt) { const int in_c = weight_data_size / (num_output * kernel_h * kernel_w); { - weight_data = weight_data.reshape(kernel_w * kernel_h, in_c, num_output); + Mat weight_3d = weight_data.reshape(kernel_w * kernel_h, in_c, num_output); weight_data_t.create(in_c, kernel_w * kernel_h, num_output); if (weight_data_t.empty()) return -100; for (int q = 0; q < num_output; q++) { - const Mat m = weight_data.channel(q); + const Mat m = weight_3d.channel(q); float* outptr = weight_data_t.channel(q); for (int i = 0; i < kernel_w * kernel_h; i++) @@ -48,6 +48,7 @@ int DeformableConv2D_x86::create_pipeline(const Option& opt) } } } + weight_3d.release(); weight_data_t = weight_data_t.reshape(in_c * kernel_w * kernel_h, num_output); inner_product = ncnn::create_layer(ncnn::LayerType::InnerProduct); From 13e8cb55abeb5e56da5fbc496a6c673f7d8aca41 Mon Sep 17 00:00:00 2001 From: miemie2013 <53960695+miemie2013@users.noreply.github.com> Date: Wed, 27 Jul 2022 16:14:51 +0800 Subject: [PATCH 12/18] fix year to 2022 --- docs/developer-guide/operators.md | 60 +++++++++++++------------- src/layer/deformableconv2d.cpp | 2 +- src/layer/x86/deformableconv2d_x86.cpp | 2 +- tests/CMakeLists.txt | 2 +- 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md index dbef4a62694..5366da1e112 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -21,6 +21,7 @@ * [DeconvolutionDepthWise](#deconvolutiondepthwise) * [DeconvolutionDepthWise1D](#deconvolutiondepthwise1d) * [DeconvolutionDepthWise3D](#deconvolutiondepthwise3d) +* [DeformableConv2D](#deformableconv2d) * [Dequantize](#dequantize) * [Dropout](#dropout) * [Eltwise](#eltwise) @@ -76,7 +77,6 @@ * [Threshold](#threshold) * [Tile](#tile) * [UnaryOp](#unaryop) -* [DeformableConv2D](#deformableconv2d) # AbsVal ``` @@ -665,6 +665,35 @@ y = activation(x3, 
act_type, act_params) | weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | | bias_data | float | [num_output] | +# DeformableConv2D +``` +x2 = deformableconv2d(x, offset, mask, weight, kernel, stride, dilation) + bias +y = activation(x2, act_type, act_params) +``` + +| param id | name | type | default | description | +| --------- | ------------- | ----- | --------- | ----------------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size| int | 0 | | +| 9 | activation_type| int | 0 | | +| 10 | activation_params| array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | + +| weight | type | shape | +| ------------- | ----- | --------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | +| bias_data | float | [num_output] | + # Dequantize ``` y = x * scale + bias @@ -1642,32 +1671,3 @@ Operation type: - 14 = ATAN - 15 = RECIPROCAL - 16 = TANH - -# DeformableConv2D -``` -x2 = deformableconv2d(x, offset, mask, weight, kernel, stride, dilation) + bias -y = activation(x2, act_type, act_params) -``` - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | -| bias_data | float | [num_output] | diff --git a/src/layer/deformableconv2d.cpp b/src/layer/deformableconv2d.cpp index c933dd1bddb..ccd3b422879 100644 --- a/src/layer/deformableconv2d.cpp +++ b/src/layer/deformableconv2d.cpp @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp index 04b4593141f..e6aabeabf39 100644 --- a/src/layer/x86/deformableconv2d_x86.cpp +++ b/src/layer/x86/deformableconv2d_x86.cpp @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. 
You may obtain a copy of the License at diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 494366b2bf7..a88c6562db2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -77,6 +77,7 @@ ncnn_add_layer_test(DeconvolutionDepthWise) ncnn_add_layer_test(DeconvolutionDepthWise1D) ncnn_add_layer_test(DeconvolutionDepthWise3D) ncnn_add_layer_test(DeepCopy) +ncnn_add_layer_test(DeformableConv2D) ncnn_add_layer_test(Dequantize) ncnn_add_layer_test(Dropout) ncnn_add_layer_test(Einsum) @@ -134,4 +135,3 @@ ncnn_add_layer_test(TanH) ncnn_add_layer_test(Tile) ncnn_add_layer_test(UnaryOp) ncnn_add_layer_test(Yolov3DetectionOutput) -ncnn_add_layer_test(DeformableConv2D) From bb1cdf8acf13e368ef47dd622a96c6f7bb60d9f5 Mon Sep 17 00:00:00 2001 From: nihui Date: Wed, 27 Jul 2022 22:10:37 +0800 Subject: [PATCH 13/18] pnnx torchvision deformconv2d conversion --- tools/pnnx/src/CMakeLists.txt | 1 + .../pass_ncnn/torchvision_DeformConv2d.cpp | 70 +++++++++++++++++++ tools/pnnx/tests/ncnn/CMakeLists.txt | 4 ++ .../ncnn/test_torchvision_DeformConv2d.py | 59 ++++++++++++++++ 4 files changed, 134 insertions(+) create mode 100644 tools/pnnx/src/pass_ncnn/torchvision_DeformConv2d.cpp create mode 100644 tools/pnnx/tests/ncnn/test_torchvision_DeformConv2d.py diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index feeac2d2692..49b79d9f7d9 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -479,6 +479,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/torch_sum.cpp pass_ncnn/torch_transpose.cpp pass_ncnn/torch_unsqueeze.cpp + pass_ncnn/torchvision_DeformConv2d.cpp ) set(pnnx_SRCS diff --git a/tools/pnnx/src/pass_ncnn/torchvision_DeformConv2d.cpp b/tools/pnnx/src/pass_ncnn/torchvision_DeformConv2d.cpp new file mode 100644 index 00000000000..92c06104465 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/torchvision_DeformConv2d.cpp @@ -0,0 +1,70 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
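+//
+// A brief sketch of the conversion, as implemented below: match_pattern_graph()
+// declares a pnnx subgraph containing a single torchvision.ops.DeformConv2d
+// operator, and write() copies the captured parameters into the ncnn
+// DeformableConv2D param ids (0 = num_output, 1/11 = kernel_w/kernel_h,
+// 2/12 = dilation_w/dilation_h, 3/13 = stride_w/stride_h, 4/14 = pad_left/pad_top,
+// 5 = bias_term, 6 = weight_data_size). torchvision stores kernel_size, stride,
+// padding and dilation as (h, w) pairs, hence ai[1] feeds the *_w params and
+// ai[0] the *_h params.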
+ +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class torchvision_DeformConv2d : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torchvision.ops.DeformConv2d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "DeformableConv2D"; + } + + const char* name_str() const + { + return "deformconv2d"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + op->params["0"] = captured_params.at("out_channels"); + op->params["1"] = captured_params.at("kernel_size").ai[1]; + op->params["11"] = captured_params.at("kernel_size").ai[0]; + op->params["2"] = captured_params.at("dilation").ai[1]; + op->params["12"] = captured_params.at("dilation").ai[0]; + op->params["3"] = captured_params.at("stride").ai[1]; + op->params["13"] = captured_params.at("stride").ai[0]; + op->params["4"] = captured_params.at("padding").ai[1]; + op->params["14"] = captured_params.at("padding").ai[0]; + op->params["5"] = captured_params.at("bias").b ? 1 : 0; + op->params["6"] = (int)(captured_attrs.at("op_0.weight").data.size() / sizeof(float)); + + op->attrs["0"] = Attribute(); + op->attrs["0"].data = {0, 0, 0, 0}; + op->attrs["1"] = captured_attrs.at("op_0.weight"); + if (captured_params.at("bias").b) + op->attrs["2"] = captured_attrs.at("op_0.bias"); + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torchvision_DeformConv2d, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/tests/ncnn/CMakeLists.txt b/tools/pnnx/tests/ncnn/CMakeLists.txt index 2bb6548220f..340fc162039 100644 --- a/tools/pnnx/tests/ncnn/CMakeLists.txt +++ b/tools/pnnx/tests/ncnn/CMakeLists.txt @@ -165,3 +165,7 @@ if(Torch_VERSION VERSION_GREATER_EQUAL "1.9") pnnx_ncnn_add_test(F_mish) pnnx_ncnn_add_test(nn_Mish) endif() + +if(TorchVision_FOUND) + pnnx_add_test(torchvision_DeformConv2d) +endif() diff --git a/tools/pnnx/tests/ncnn/test_torchvision_DeformConv2d.py b/tools/pnnx/tests/ncnn/test_torchvision_DeformConv2d.py new file mode 100644 index 00000000000..b20dae01955 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torchvision_DeformConv2d.py @@ -0,0 +1,59 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
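+#
+# Test flow, summarizing the code below: build a small module that runs
+# torchvision.ops.DeformConv2d, trace it to TorchScript, convert the traced
+# module with pnnx, then run the exported ncnn model and compare its output
+# against the PyTorch result.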
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.conv_0 = nn.Conv2d(in_channels=12, out_channels=2*3*3, kernel_size=3) + self.conv_1 = torchvision.ops.DeformConv2d(in_channels=12, out_channels=16, kernel_size=3) + + def forward(self, x): + offset = self.conv_0(x) + x = self.conv_1(x, offset) + return x + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 64, 64) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_torchvision_DeformConv2d.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torchvision_DeformConv2d.pt inputshape=[1,12,64,64]") + + # ncnn inference + import test_torchvision_DeformConv2d_ncnn + b = test_torchvision_DeformConv2d_ncnn.test_inference() + + return torch.equal(a, b) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) From a2bb8266abc947b3b77c577f6f893054b71349fc Mon Sep 17 00:00:00 2001 From: nihui Date: Thu, 28 Jul 2022 09:51:22 +0800 Subject: [PATCH 14/18] deformableconv2d with optional mask, finish pnnx conversion and test --- src/layer/deformableconv2d.cpp | 5 +- src/layer/x86/deformableconv2d_x86.cpp | 8 +-- .../pass_ncnn/torchvision_DeformConv2d.cpp | 55 ++++++++++++++++++- tools/pnnx/tests/ncnn/CMakeLists.txt | 2 +- .../ncnn/test_torchvision_DeformConv2d.py | 18 ++++-- .../tests/test_torchvision_DeformConv2d.py | 16 ++++-- 6 files changed, 82 insertions(+), 22 deletions(-) diff --git a/src/layer/deformableconv2d.cpp b/src/layer/deformableconv2d.cpp index ccd3b422879..628e8de0aa0 100644 --- a/src/layer/deformableconv2d.cpp +++ b/src/layer/deformableconv2d.cpp @@ -63,7 +63,8 @@ int DeformableConv2D::forward(const std::vector& bottom_blobs, std::vector< { const Mat& bottom_blob = bottom_blobs[0]; const Mat& offset = bottom_blobs[1]; - const Mat& mask = bottom_blobs[2]; + + const bool has_mask = (bottom_blobs.size() == 3); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -106,7 +107,7 @@ int DeformableConv2D::forward(const std::vector& bottom_blobs, std::vector< { const float offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; const float offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; - const float mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + const float mask_ = has_mask ? 
bottom_blobs[2].channel(i * kernel_w + j).row(h_col)[w_col] : 1.f; const float h_im = h_in + i * dilation_h + offset_h; const float w_im = w_in + j * dilation_w + offset_w; diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp index e6aabeabf39..869815283d9 100644 --- a/src/layer/x86/deformableconv2d_x86.cpp +++ b/src/layer/x86/deformableconv2d_x86.cpp @@ -98,7 +98,8 @@ int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vec { const Mat& bottom_blob = bottom_blobs[0]; const Mat& offset = bottom_blobs[1]; - const Mat& mask = bottom_blobs[2]; + + const bool has_mask = (bottom_blobs.size() == 3); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -126,10 +127,8 @@ int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vec Mat bottom_blob_flatten = bottom_blob.reshape(w * h * in_c); Mat offset_flatten = offset.reshape(offset.w * offset.h * offset.c); - Mat mask_flatten = mask.reshape(mask.w * mask.h * mask.c); const float* data_im_ptr = bottom_blob_flatten; const float* data_offset_ptr = offset_flatten; - const float* data_mask_ptr = mask_flatten; float* im2col_ptr = im2col; // im2col @@ -147,11 +146,10 @@ int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vec { const int data_offset_h_ptr = (((i * kernel_w + j) * 2) * out_h + h_col) * out_w + w_col; const int data_offset_w_ptr = (((i * kernel_w + j) * 2 + 1) * out_h + h_col) * out_w + w_col; - const int data_mask_hw_ptr = ((i * kernel_w + j) * out_h + h_col) * out_w + w_col; const float offset_h = data_offset_ptr[data_offset_h_ptr]; const float offset_w = data_offset_ptr[data_offset_w_ptr]; - const float mask_ = data_mask_ptr[data_mask_hw_ptr]; + const float mask_ = has_mask ? bottom_blobs[2].channel(i * kernel_w + j).row(h_col)[w_col] : 1.f; const float h_im = h_in + i * dilation_h + offset_h; const float w_im = w_in + j * dilation_w + offset_w; diff --git a/tools/pnnx/src/pass_ncnn/torchvision_DeformConv2d.cpp b/tools/pnnx/src/pass_ncnn/torchvision_DeformConv2d.cpp index 92c06104465..0c012e4b502 100644 --- a/tools/pnnx/src/pass_ncnn/torchvision_DeformConv2d.cpp +++ b/tools/pnnx/src/pass_ncnn/torchvision_DeformConv2d.cpp @@ -24,9 +24,57 @@ class torchvision_DeformConv2d : public GraphRewriterPass const char* match_pattern_graph() const { return R"PNNXIR(7767517 -3 2 -pnnx.Input input 0 1 input -torchvision.ops.DeformConv2d op_0 1 1 input out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias +4 3 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 offset +torchvision.ops.DeformConv2d op_0 2 1 input offset out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "DeformableConv2D"; + } + + const char* name_str() const + { + return "deformconv2d"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + op->params["0"] = captured_params.at("out_channels"); + op->params["1"] = captured_params.at("kernel_size").ai[1]; + op->params["11"] = captured_params.at("kernel_size").ai[0]; + op->params["2"] = captured_params.at("dilation").ai[1]; + op->params["12"] = captured_params.at("dilation").ai[0]; + op->params["3"] = captured_params.at("stride").ai[1]; + op->params["13"] = 
captured_params.at("stride").ai[0]; + op->params["4"] = captured_params.at("padding").ai[1]; + op->params["14"] = captured_params.at("padding").ai[0]; + op->params["5"] = captured_params.at("bias").b ? 1 : 0; + op->params["6"] = (int)(captured_attrs.at("op_0.weight").data.size() / sizeof(float)); + + op->attrs["0"] = Attribute(); + op->attrs["0"].data = {0, 0, 0, 0}; + op->attrs["1"] = captured_attrs.at("op_0.weight"); + if (captured_params.at("bias").b) + op->attrs["2"] = captured_attrs.at("op_0.bias"); + } +}; + +class torchvision_DeformConv2d_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 offset +pnnx.Input input_2 0 1 mask +torchvision.ops.DeformConv2d op_0 3 1 input offset mask out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding=%padding dilation=%dilation groups=1 bias=%bias @weight @bias pnnx.Output output 1 0 out )PNNXIR"; } @@ -64,6 +112,7 @@ pnnx.Output output 1 0 out }; REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torchvision_DeformConv2d, 20) +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torchvision_DeformConv2d_1, 20) } // namespace ncnn diff --git a/tools/pnnx/tests/ncnn/CMakeLists.txt b/tools/pnnx/tests/ncnn/CMakeLists.txt index 340fc162039..f99b9429882 100644 --- a/tools/pnnx/tests/ncnn/CMakeLists.txt +++ b/tools/pnnx/tests/ncnn/CMakeLists.txt @@ -167,5 +167,5 @@ if(Torch_VERSION VERSION_GREATER_EQUAL "1.9") endif() if(TorchVision_FOUND) - pnnx_add_test(torchvision_DeformConv2d) + pnnx_ncnn_add_test(torchvision_DeformConv2d) endif() diff --git a/tools/pnnx/tests/ncnn/test_torchvision_DeformConv2d.py b/tools/pnnx/tests/ncnn/test_torchvision_DeformConv2d.py index b20dae01955..adab655f1e8 100644 --- a/tools/pnnx/tests/ncnn/test_torchvision_DeformConv2d.py +++ b/tools/pnnx/tests/ncnn/test_torchvision_DeformConv2d.py @@ -24,19 +24,25 @@ def __init__(self): self.conv_0 = nn.Conv2d(in_channels=12, out_channels=2*3*3, kernel_size=3) self.conv_1 = torchvision.ops.DeformConv2d(in_channels=12, out_channels=16, kernel_size=3) + self.conv_2 = nn.Conv2d(in_channels=12, out_channels=3*3, kernel_size=3) + self.conv_3 = torchvision.ops.DeformConv2d(in_channels=12, out_channels=16, kernel_size=3) + def forward(self, x): offset = self.conv_0(x) - x = self.conv_1(x, offset) - return x + x1 = self.conv_1(x, offset) + + mask = self.conv_2(x) + x2 = self.conv_3(x, offset, mask) + return x1, x2 def test(): - net = Model() + net = Model().half().float() net.eval() torch.manual_seed(0) x = torch.rand(1, 12, 64, 64) - a = net(x) + a0, a1 = net(x) # export torchscript mod = torch.jit.trace(net, x) @@ -48,9 +54,9 @@ def test(): # ncnn inference import test_torchvision_DeformConv2d_ncnn - b = test_torchvision_DeformConv2d_ncnn.test_inference() + b0, b1 = test_torchvision_DeformConv2d_ncnn.test_inference() - return torch.equal(a, b) + return torch.allclose(a0, b0, 1e-4, 1e-4) and torch.allclose(a1, b1, 1e-4, 1e-4) if __name__ == "__main__": if test(): diff --git a/tools/pnnx/tests/test_torchvision_DeformConv2d.py b/tools/pnnx/tests/test_torchvision_DeformConv2d.py index 951187330cf..286cca7583a 100644 --- a/tools/pnnx/tests/test_torchvision_DeformConv2d.py +++ b/tools/pnnx/tests/test_torchvision_DeformConv2d.py @@ -24,10 +24,16 @@ def __init__(self): self.conv_0 = nn.Conv2d(in_channels=12, out_channels=2*3*3, kernel_size=3) self.conv_1 = torchvision.ops.DeformConv2d(in_channels=12, out_channels=16, kernel_size=3) + self.conv_2 = 
nn.Conv2d(in_channels=12, out_channels=3*3, kernel_size=3) + self.conv_3 = torchvision.ops.DeformConv2d(in_channels=12, out_channels=16, kernel_size=3) + def forward(self, x): offset = self.conv_0(x) - x = self.conv_1(x, offset) - return x + x1 = self.conv_1(x, offset) + + mask = self.conv_2(x) + x2 = self.conv_3(x, offset, mask) + return x1, x2 def test(): net = Model() @@ -36,7 +42,7 @@ def test(): torch.manual_seed(0) x = torch.rand(1, 12, 64, 64) - a = net(x) + a0, a1 = net(x) # export torchscript mod = torch.jit.trace(net, x) @@ -48,9 +54,9 @@ def test(): # pnnx inference import test_torchvision_DeformConv2d_pnnx - b = test_torchvision_DeformConv2d_pnnx.test_inference() + b0, b1 = test_torchvision_DeformConv2d_pnnx.test_inference() - return torch.equal(a, b) + return torch.equal(a0, b0) and torch.equal(a1, b1) if __name__ == "__main__": if test(): From c405dfd756bcdba863eb36b00ce819fa55ff1202 Mon Sep 17 00:00:00 2001 From: nihui Date: Thu, 28 Jul 2022 10:28:47 +0800 Subject: [PATCH 15/18] use sigmoid mask --- tools/pnnx/tests/ncnn/test_torchvision_DeformConv2d.py | 2 +- tools/pnnx/tests/test_torchvision_DeformConv2d.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/pnnx/tests/ncnn/test_torchvision_DeformConv2d.py b/tools/pnnx/tests/ncnn/test_torchvision_DeformConv2d.py index adab655f1e8..e7eba10168d 100644 --- a/tools/pnnx/tests/ncnn/test_torchvision_DeformConv2d.py +++ b/tools/pnnx/tests/ncnn/test_torchvision_DeformConv2d.py @@ -31,7 +31,7 @@ def forward(self, x): offset = self.conv_0(x) x1 = self.conv_1(x, offset) - mask = self.conv_2(x) + mask = F.sigmoid(self.conv_2(x)) x2 = self.conv_3(x, offset, mask) return x1, x2 diff --git a/tools/pnnx/tests/test_torchvision_DeformConv2d.py b/tools/pnnx/tests/test_torchvision_DeformConv2d.py index 286cca7583a..124a9660c95 100644 --- a/tools/pnnx/tests/test_torchvision_DeformConv2d.py +++ b/tools/pnnx/tests/test_torchvision_DeformConv2d.py @@ -31,7 +31,7 @@ def forward(self, x): offset = self.conv_0(x) x1 = self.conv_1(x, offset) - mask = self.conv_2(x) + mask = F.sigmoid(self.conv_2(x)) x2 = self.conv_3(x, offset, mask) return x1, x2 From 958c8b06039066e090b5805b14f47521a5c0fc80 Mon Sep 17 00:00:00 2001 From: miemie2013 <53960695+miemie2013@users.noreply.github.com> Date: Wed, 10 Aug 2022 16:56:48 +0800 Subject: [PATCH 16/18] Optimize x86 DeformableConv2D --- src/layer/x86/deformableconv2d_pack16.h | 433 +++++++++ src/layer/x86/deformableconv2d_pack16to1.h | 368 ++++++++ src/layer/x86/deformableconv2d_pack16to4.h | 433 +++++++++ src/layer/x86/deformableconv2d_pack16to8.h | 433 +++++++++ src/layer/x86/deformableconv2d_pack1to16.h | 193 ++++ src/layer/x86/deformableconv2d_pack1to4.h | 193 ++++ src/layer/x86/deformableconv2d_pack1to8.h | 193 ++++ src/layer/x86/deformableconv2d_pack4.h | 241 +++++ src/layer/x86/deformableconv2d_pack4to1.h | 209 +++++ src/layer/x86/deformableconv2d_pack4to16.h | 241 +++++ src/layer/x86/deformableconv2d_pack4to8.h | 241 +++++ src/layer/x86/deformableconv2d_pack8.h | 305 ++++++ src/layer/x86/deformableconv2d_pack8to1.h | 257 +++++ src/layer/x86/deformableconv2d_pack8to16.h | 305 ++++++ src/layer/x86/deformableconv2d_pack8to4.h | 305 ++++++ src/layer/x86/deformableconv2d_sgemm.h | 134 +++ src/layer/x86/deformableconv2d_sgemm_pack16.h | 178 ++++ .../x86/deformableconv2d_sgemm_pack16to1.h | 178 ++++ .../x86/deformableconv2d_sgemm_pack16to4.h | 178 ++++ .../x86/deformableconv2d_sgemm_pack16to8.h | 178 ++++ .../x86/deformableconv2d_sgemm_pack1to16.h | 134 +++ 
.../x86/deformableconv2d_sgemm_pack1to4.h | 134 +++ .../x86/deformableconv2d_sgemm_pack1to8.h | 134 +++ src/layer/x86/deformableconv2d_sgemm_pack4.h | 178 ++++ .../x86/deformableconv2d_sgemm_pack4to1.h | 178 ++++ .../x86/deformableconv2d_sgemm_pack4to16.h | 178 ++++ .../x86/deformableconv2d_sgemm_pack4to8.h | 178 ++++ src/layer/x86/deformableconv2d_sgemm_pack8.h | 178 ++++ .../x86/deformableconv2d_sgemm_pack8to1.h | 178 ++++ .../x86/deformableconv2d_sgemm_pack8to16.h | 178 ++++ .../x86/deformableconv2d_sgemm_pack8to4.h | 178 ++++ src/layer/x86/deformableconv2d_x86.cpp | 881 +++++++++++++++--- src/layer/x86/deformableconv2d_x86.h | 6 +- tests/test_deformableconv2d.cpp | 18 +- 34 files changed, 7771 insertions(+), 156 deletions(-) create mode 100644 src/layer/x86/deformableconv2d_pack16.h create mode 100644 src/layer/x86/deformableconv2d_pack16to1.h create mode 100644 src/layer/x86/deformableconv2d_pack16to4.h create mode 100644 src/layer/x86/deformableconv2d_pack16to8.h create mode 100644 src/layer/x86/deformableconv2d_pack1to16.h create mode 100644 src/layer/x86/deformableconv2d_pack1to4.h create mode 100644 src/layer/x86/deformableconv2d_pack1to8.h create mode 100644 src/layer/x86/deformableconv2d_pack4.h create mode 100644 src/layer/x86/deformableconv2d_pack4to1.h create mode 100644 src/layer/x86/deformableconv2d_pack4to16.h create mode 100644 src/layer/x86/deformableconv2d_pack4to8.h create mode 100644 src/layer/x86/deformableconv2d_pack8.h create mode 100644 src/layer/x86/deformableconv2d_pack8to1.h create mode 100644 src/layer/x86/deformableconv2d_pack8to16.h create mode 100644 src/layer/x86/deformableconv2d_pack8to4.h create mode 100644 src/layer/x86/deformableconv2d_sgemm.h create mode 100644 src/layer/x86/deformableconv2d_sgemm_pack16.h create mode 100644 src/layer/x86/deformableconv2d_sgemm_pack16to1.h create mode 100644 src/layer/x86/deformableconv2d_sgemm_pack16to4.h create mode 100644 src/layer/x86/deformableconv2d_sgemm_pack16to8.h create mode 100644 src/layer/x86/deformableconv2d_sgemm_pack1to16.h create mode 100644 src/layer/x86/deformableconv2d_sgemm_pack1to4.h create mode 100644 src/layer/x86/deformableconv2d_sgemm_pack1to8.h create mode 100644 src/layer/x86/deformableconv2d_sgemm_pack4.h create mode 100644 src/layer/x86/deformableconv2d_sgemm_pack4to1.h create mode 100644 src/layer/x86/deformableconv2d_sgemm_pack4to16.h create mode 100644 src/layer/x86/deformableconv2d_sgemm_pack4to8.h create mode 100644 src/layer/x86/deformableconv2d_sgemm_pack8.h create mode 100644 src/layer/x86/deformableconv2d_sgemm_pack8to1.h create mode 100644 src/layer/x86/deformableconv2d_sgemm_pack8to16.h create mode 100644 src/layer/x86/deformableconv2d_sgemm_pack8to4.h diff --git a/src/layer/x86/deformableconv2d_pack16.h b/src/layer/x86/deformableconv2d_pack16.h new file mode 100644 index 00000000000..7de08bc986d --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack16.h @@ -0,0 +1,433 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 16; + const int out_elempack = 16; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m512 _sum = _mm512_setzero_ps(); + if (bias_data_ptr) + _sum = _mm512_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + 
float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val_channel0 = _mm512_loadu_ps(zeros_ptr); + __m512 _val_channel1 = _val_channel0; + __m512 _val_channel2 = _val_channel0; + __m512 _val_channel3 = _val_channel0; + __m512 _val_channel4 = _val_channel0; + __m512 _val_channel5 = _val_channel0; + __m512 _val_channel6 = _val_channel0; + __m512 _val_channel7 = _val_channel0; + __m512 _val_channel8 = _val_channel0; + __m512 _val_channel9 = _val_channel0; + __m512 _val_channela = _val_channel0; + __m512 _val_channelb = _val_channel0; + __m512 _val_channelc = _val_channel0; + __m512 _val_channeld = _val_channel0; + __m512 _val_channele = _val_channel0; + __m512 _val_channelf = _val_channel0; + if (cond) + { + __m512 _v1_channel0 = _val_channel0; + __m512 _v1_channel1 = _val_channel0; + __m512 _v1_channel2 = _val_channel0; + __m512 _v1_channel3 = _val_channel0; + __m512 _v1_channel4 = _val_channel0; + __m512 _v1_channel5 = _val_channel0; + __m512 _v1_channel6 = _val_channel0; + __m512 _v1_channel7 = _val_channel0; + __m512 _v1_channel8 = _val_channel0; + __m512 _v1_channel9 = _val_channel0; + __m512 _v1_channela = _val_channel0; + __m512 _v1_channelb = _val_channel0; + __m512 _v1_channelc = _val_channel0; + __m512 _v1_channeld = _val_channel0; + __m512 _v1_channele = _val_channel0; + __m512 _v1_channelf = _val_channel0; + __m512 _v2_channel0 = _val_channel0; + __m512 _v2_channel1 = _val_channel0; + __m512 _v2_channel2 = _val_channel0; + __m512 _v2_channel3 = _val_channel0; + __m512 _v2_channel4 = _val_channel0; + __m512 _v2_channel5 = _val_channel0; + __m512 _v2_channel6 = _val_channel0; + __m512 _v2_channel7 = _val_channel0; + __m512 _v2_channel8 = _val_channel0; + __m512 _v2_channel9 = _val_channel0; + __m512 _v2_channela = _val_channel0; + __m512 _v2_channelb = _val_channel0; + __m512 _v2_channelc = _val_channel0; + __m512 _v2_channeld = _val_channel0; + __m512 _v2_channele = _val_channel0; + __m512 _v2_channelf = _val_channel0; + __m512 _v3_channel0 = _val_channel0; + __m512 _v3_channel1 = _val_channel0; + __m512 _v3_channel2 = _val_channel0; + __m512 _v3_channel3 = _val_channel0; + __m512 _v3_channel4 = _val_channel0; + __m512 _v3_channel5 = _val_channel0; + __m512 _v3_channel6 = _val_channel0; + __m512 _v3_channel7 = _val_channel0; 
+ __m512 _v3_channel8 = _val_channel0; + __m512 _v3_channel9 = _val_channel0; + __m512 _v3_channela = _val_channel0; + __m512 _v3_channelb = _val_channel0; + __m512 _v3_channelc = _val_channel0; + __m512 _v3_channeld = _val_channel0; + __m512 _v3_channele = _val_channel0; + __m512 _v3_channelf = _val_channel0; + __m512 _v4_channel0 = _val_channel0; + __m512 _v4_channel1 = _val_channel0; + __m512 _v4_channel2 = _val_channel0; + __m512 _v4_channel3 = _val_channel0; + __m512 _v4_channel4 = _val_channel0; + __m512 _v4_channel5 = _val_channel0; + __m512 _v4_channel6 = _val_channel0; + __m512 _v4_channel7 = _val_channel0; + __m512 _v4_channel8 = _val_channel0; + __m512 _v4_channel9 = _val_channel0; + __m512 _v4_channela = _val_channel0; + __m512 _v4_channelb = _val_channel0; + __m512 _v4_channelc = _val_channel0; + __m512 _v4_channeld = _val_channel0; + __m512 _v4_channele = _val_channel0; + __m512 _v4_channelf = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack]); + _v1_channel1 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 1]); + _v1_channel2 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 2]); + _v1_channel3 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 3]); + _v1_channel4 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 4]); + _v1_channel5 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 5]); + _v1_channel6 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 6]); + _v1_channel7 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 7]); + _v1_channel8 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 8]); + _v1_channel9 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 9]); + _v1_channela = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 10]); + _v1_channelb = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 11]); + _v1_channelc = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 12]); + _v1_channeld = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 13]); + _v1_channele = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 14]); + _v1_channelf = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 15]); + } + if (v2_cond) + { + _v2_channel0 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack]); + _v2_channel1 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 1]); + _v2_channel2 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 2]); + _v2_channel3 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 3]); + _v2_channel4 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 4]); + _v2_channel5 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 5]); + _v2_channel6 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 6]); + _v2_channel7 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 7]); + _v2_channel8 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 8]); + _v2_channel9 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 9]); + _v2_channela = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 10]); + _v2_channelb = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 11]); + _v2_channelc = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 12]); + _v2_channeld = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 13]); + _v2_channele = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 14]); + _v2_channelf = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 15]); + } + if (v3_cond) + { + _v3_channel0 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack]); + _v3_channel1 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 1]); + _v3_channel2 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 2]); + _v3_channel3 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 3]); + _v3_channel4 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 4]); 
+ _v3_channel5 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 5]); + _v3_channel6 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 6]); + _v3_channel7 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 7]); + _v3_channel8 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 8]); + _v3_channel9 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 9]); + _v3_channela = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 10]); + _v3_channelb = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 11]); + _v3_channelc = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 12]); + _v3_channeld = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 13]); + _v3_channele = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 14]); + _v3_channelf = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 15]); + } + if (v4_cond) + { + _v4_channel0 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack]); + _v4_channel1 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 1]); + _v4_channel2 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 2]); + _v4_channel3 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 3]); + _v4_channel4 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 4]); + _v4_channel5 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 5]); + _v4_channel6 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 6]); + _v4_channel7 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 7]); + _v4_channel8 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 8]); + _v4_channel9 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 9]); + _v4_channela = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 10]); + _v4_channelb = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 11]); + _v4_channelc = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 12]); + _v4_channeld = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 13]); + _v4_channele = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 14]); + _v4_channelf = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 15]); + } + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val_channel0 = _mm512_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm512_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm512_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm512_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm512_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm512_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v2_channel5, 
_w2, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm512_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm512_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v4_channel7, _w4, _val_channel7); + _val_channel8 = _mm512_fmadd_ps(_v1_channel8, _w1, _val_channel8); + _val_channel8 = _mm512_fmadd_ps(_v2_channel8, _w2, _val_channel8); + _val_channel8 = _mm512_fmadd_ps(_v3_channel8, _w3, _val_channel8); + _val_channel8 = _mm512_fmadd_ps(_v4_channel8, _w4, _val_channel8); + _val_channel9 = _mm512_fmadd_ps(_v1_channel9, _w1, _val_channel9); + _val_channel9 = _mm512_fmadd_ps(_v2_channel9, _w2, _val_channel9); + _val_channel9 = _mm512_fmadd_ps(_v3_channel9, _w3, _val_channel9); + _val_channel9 = _mm512_fmadd_ps(_v4_channel9, _w4, _val_channel9); + _val_channela = _mm512_fmadd_ps(_v1_channela, _w1, _val_channela); + _val_channela = _mm512_fmadd_ps(_v2_channela, _w2, _val_channela); + _val_channela = _mm512_fmadd_ps(_v3_channela, _w3, _val_channela); + _val_channela = _mm512_fmadd_ps(_v4_channela, _w4, _val_channela); + _val_channelb = _mm512_fmadd_ps(_v1_channelb, _w1, _val_channelb); + _val_channelb = _mm512_fmadd_ps(_v2_channelb, _w2, _val_channelb); + _val_channelb = _mm512_fmadd_ps(_v3_channelb, _w3, _val_channelb); + _val_channelb = _mm512_fmadd_ps(_v4_channelb, _w4, _val_channelb); + _val_channelc = _mm512_fmadd_ps(_v1_channelc, _w1, _val_channelc); + _val_channelc = _mm512_fmadd_ps(_v2_channelc, _w2, _val_channelc); + _val_channelc = _mm512_fmadd_ps(_v3_channelc, _w3, _val_channelc); + _val_channelc = _mm512_fmadd_ps(_v4_channelc, _w4, _val_channelc); + _val_channeld = _mm512_fmadd_ps(_v1_channeld, _w1, _val_channeld); + _val_channeld = _mm512_fmadd_ps(_v2_channeld, _w2, _val_channeld); + _val_channeld = _mm512_fmadd_ps(_v3_channeld, _w3, _val_channeld); + _val_channeld = _mm512_fmadd_ps(_v4_channeld, _w4, _val_channeld); + _val_channele = _mm512_fmadd_ps(_v1_channele, _w1, _val_channele); + _val_channele = _mm512_fmadd_ps(_v2_channele, _w2, _val_channele); + _val_channele = _mm512_fmadd_ps(_v3_channele, _w3, _val_channele); + _val_channele = _mm512_fmadd_ps(_v4_channele, _w4, _val_channele); + _val_channelf = _mm512_fmadd_ps(_v1_channelf, _w1, _val_channelf); + _val_channelf = _mm512_fmadd_ps(_v2_channelf, _w2, _val_channelf); + _val_channelf = _mm512_fmadd_ps(_v3_channelf, _w3, _val_channelf); + _val_channelf = _mm512_fmadd_ps(_v4_channelf, _w4, _val_channelf); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val_channel0 = _mm512_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm512_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm512_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm512_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm512_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm512_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm512_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm512_mul_ps(_val_channel7, _mask); + _val_channel8 = _mm512_mul_ps(_val_channel8, _mask); + _val_channel9 = _mm512_mul_ps(_val_channel9, _mask); + _val_channela = 
_mm512_mul_ps(_val_channela, _mask); + _val_channelb = _mm512_mul_ps(_val_channelb, _mask); + _val_channelc = _mm512_mul_ps(_val_channelc, _mask); + _val_channeld = _mm512_mul_ps(_val_channeld, _mask); + _val_channele = _mm512_mul_ps(_val_channele, _mask); + _val_channelf = _mm512_mul_ps(_val_channelf, _mask); + } + __m512 _conv_w0 = _mm512_load_ps(kptr); + __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm512_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m512 _conv_w2 = _mm512_load_ps(kptr + 32); // 2 * out_elempack + __m512 _conv_w3 = _mm512_load_ps(kptr + 48); // 3 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm512_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m512 _conv_w4 = _mm512_load_ps(kptr + 64); // 4 * out_elempack + __m512 _conv_w5 = _mm512_load_ps(kptr + 80); // 5 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm512_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m512 _conv_w6 = _mm512_load_ps(kptr + 96); // 6 * out_elempack + __m512 _conv_w7 = _mm512_load_ps(kptr + 112); // 7 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm512_fmadd_ps(_val_channel7, _conv_w7, _sum); + __m512 _conv_w8 = _mm512_load_ps(kptr + 128); // 8 * out_elempack + __m512 _conv_w9 = _mm512_load_ps(kptr + 144); // 9 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel8, _conv_w8, _sum); + _sum = _mm512_fmadd_ps(_val_channel9, _conv_w9, _sum); + __m512 _conv_wa = _mm512_load_ps(kptr + 160); // 10 * out_elempack + __m512 _conv_wb = _mm512_load_ps(kptr + 176); // 11 * out_elempack + _sum = _mm512_fmadd_ps(_val_channela, _conv_wa, _sum); + _sum = _mm512_fmadd_ps(_val_channelb, _conv_wb, _sum); + __m512 _conv_wc = _mm512_load_ps(kptr + 192); // 12 * out_elempack + __m512 _conv_wd = _mm512_load_ps(kptr + 208); // 13 * out_elempack + _sum = _mm512_fmadd_ps(_val_channelc, _conv_wc, _sum); + _sum = _mm512_fmadd_ps(_val_channeld, _conv_wd, _sum); + __m512 _conv_we = _mm512_load_ps(kptr + 224); // 14 * out_elempack + __m512 _conv_wf = _mm512_load_ps(kptr + 240); // 15 * out_elempack + _sum = _mm512_fmadd_ps(_val_channele, _conv_we, _sum); + _sum = _mm512_fmadd_ps(_val_channelf, _conv_wf, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx512(_sum, activation_type, activation_params); + _mm512_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack16to1.h b/src/layer/x86/deformableconv2d_pack16to1.h new file mode 100644 index 00000000000..f900464ad4e --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack16to1.h @@ -0,0 +1,368 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
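+//
+// pack16to1 layout, as used by the kernel below: the input blob is packed with
+// elempack 16 (sixteen channels interleaved per element) while the output is
+// plain elempack 1. For each output location the code bilinearly samples all
+// sixteen packed channels (w1..w4 are the bilinear corner weights), applies the
+// optional modulation mask, then accumulates the products with the per-channel
+// kernel weights into a single scalar _sum before applying the activation.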
+ +static void deformableconv2d_pack16to1_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 16; + const int out_elempack = 1; + const int wstep = out_elempack * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + float _sum = 0.f; + if (bias_data_ptr) + _sum = *(bias_data_ptr + oc); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = 
bottom_blob.channel(ic); + float _val_channel0 = 0.f; + float _val_channel1 = _val_channel0; + float _val_channel2 = _val_channel0; + float _val_channel3 = _val_channel0; + float _val_channel4 = _val_channel0; + float _val_channel5 = _val_channel0; + float _val_channel6 = _val_channel0; + float _val_channel7 = _val_channel0; + float _val_channel8 = _val_channel0; + float _val_channel9 = _val_channel0; + float _val_channela = _val_channel0; + float _val_channelb = _val_channel0; + float _val_channelc = _val_channel0; + float _val_channeld = _val_channel0; + float _val_channele = _val_channel0; + float _val_channelf = _val_channel0; + if (cond) + { + float _v1_channel0 = _val_channel0; + float _v1_channel1 = _val_channel0; + float _v1_channel2 = _val_channel0; + float _v1_channel3 = _val_channel0; + float _v1_channel4 = _val_channel0; + float _v1_channel5 = _val_channel0; + float _v1_channel6 = _val_channel0; + float _v1_channel7 = _val_channel0; + float _v1_channel8 = _val_channel0; + float _v1_channel9 = _val_channel0; + float _v1_channela = _val_channel0; + float _v1_channelb = _val_channel0; + float _v1_channelc = _val_channel0; + float _v1_channeld = _val_channel0; + float _v1_channele = _val_channel0; + float _v1_channelf = _val_channel0; + float _v2_channel0 = _val_channel0; + float _v2_channel1 = _val_channel0; + float _v2_channel2 = _val_channel0; + float _v2_channel3 = _val_channel0; + float _v2_channel4 = _val_channel0; + float _v2_channel5 = _val_channel0; + float _v2_channel6 = _val_channel0; + float _v2_channel7 = _val_channel0; + float _v2_channel8 = _val_channel0; + float _v2_channel9 = _val_channel0; + float _v2_channela = _val_channel0; + float _v2_channelb = _val_channel0; + float _v2_channelc = _val_channel0; + float _v2_channeld = _val_channel0; + float _v2_channele = _val_channel0; + float _v2_channelf = _val_channel0; + float _v3_channel0 = _val_channel0; + float _v3_channel1 = _val_channel0; + float _v3_channel2 = _val_channel0; + float _v3_channel3 = _val_channel0; + float _v3_channel4 = _val_channel0; + float _v3_channel5 = _val_channel0; + float _v3_channel6 = _val_channel0; + float _v3_channel7 = _val_channel0; + float _v3_channel8 = _val_channel0; + float _v3_channel9 = _val_channel0; + float _v3_channela = _val_channel0; + float _v3_channelb = _val_channel0; + float _v3_channelc = _val_channel0; + float _v3_channeld = _val_channel0; + float _v3_channele = _val_channel0; + float _v3_channelf = _val_channel0; + float _v4_channel0 = _val_channel0; + float _v4_channel1 = _val_channel0; + float _v4_channel2 = _val_channel0; + float _v4_channel3 = _val_channel0; + float _v4_channel4 = _val_channel0; + float _v4_channel5 = _val_channel0; + float _v4_channel6 = _val_channel0; + float _v4_channel7 = _val_channel0; + float _v4_channel8 = _val_channel0; + float _v4_channel9 = _val_channel0; + float _v4_channela = _val_channel0; + float _v4_channelb = _val_channel0; + float _v4_channelc = _val_channel0; + float _v4_channeld = _val_channel0; + float _v4_channele = _val_channel0; + float _v4_channelf = _val_channel0; + if (v1_cond) + { + _v1_channel0 = *(data_im_ptr + v1_pos * elempack); + _v1_channel1 = *(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = *(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = *(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = *(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = *(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = *(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = *(data_im_ptr + v1_pos * elempack + 
7); + _v1_channel8 = *(data_im_ptr + v1_pos * elempack + 8); + _v1_channel9 = *(data_im_ptr + v1_pos * elempack + 9); + _v1_channela = *(data_im_ptr + v1_pos * elempack + 10); + _v1_channelb = *(data_im_ptr + v1_pos * elempack + 11); + _v1_channelc = *(data_im_ptr + v1_pos * elempack + 12); + _v1_channeld = *(data_im_ptr + v1_pos * elempack + 13); + _v1_channele = *(data_im_ptr + v1_pos * elempack + 14); + _v1_channelf = *(data_im_ptr + v1_pos * elempack + 15); + } + if (v2_cond) + { + _v2_channel0 = *(data_im_ptr + v2_pos * elempack); + _v2_channel1 = *(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = *(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = *(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = *(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = *(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = *(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = *(data_im_ptr + v2_pos * elempack + 7); + _v2_channel8 = *(data_im_ptr + v2_pos * elempack + 8); + _v2_channel9 = *(data_im_ptr + v2_pos * elempack + 9); + _v2_channela = *(data_im_ptr + v2_pos * elempack + 10); + _v2_channelb = *(data_im_ptr + v2_pos * elempack + 11); + _v2_channelc = *(data_im_ptr + v2_pos * elempack + 12); + _v2_channeld = *(data_im_ptr + v2_pos * elempack + 13); + _v2_channele = *(data_im_ptr + v2_pos * elempack + 14); + _v2_channelf = *(data_im_ptr + v2_pos * elempack + 15); + } + if (v3_cond) + { + _v3_channel0 = *(data_im_ptr + v3_pos * elempack); + _v3_channel1 = *(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = *(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = *(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = *(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = *(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = *(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = *(data_im_ptr + v3_pos * elempack + 7); + _v3_channel8 = *(data_im_ptr + v3_pos * elempack + 8); + _v3_channel9 = *(data_im_ptr + v3_pos * elempack + 9); + _v3_channela = *(data_im_ptr + v3_pos * elempack + 10); + _v3_channelb = *(data_im_ptr + v3_pos * elempack + 11); + _v3_channelc = *(data_im_ptr + v3_pos * elempack + 12); + _v3_channeld = *(data_im_ptr + v3_pos * elempack + 13); + _v3_channele = *(data_im_ptr + v3_pos * elempack + 14); + _v3_channelf = *(data_im_ptr + v3_pos * elempack + 15); + } + if (v4_cond) + { + _v4_channel0 = *(data_im_ptr + v4_pos * elempack); + _v4_channel1 = *(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = *(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = *(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = *(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = *(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = *(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = *(data_im_ptr + v4_pos * elempack + 7); + _v4_channel8 = *(data_im_ptr + v4_pos * elempack + 8); + _v4_channel9 = *(data_im_ptr + v4_pos * elempack + 9); + _v4_channela = *(data_im_ptr + v4_pos * elempack + 10); + _v4_channelb = *(data_im_ptr + v4_pos * elempack + 11); + _v4_channelc = *(data_im_ptr + v4_pos * elempack + 12); + _v4_channeld = *(data_im_ptr + v4_pos * elempack + 13); + _v4_channele = *(data_im_ptr + v4_pos * elempack + 14); + _v4_channelf = *(data_im_ptr + v4_pos * elempack + 15); + } + _val_channel0 = w1 * _v1_channel0 + w2 * _v2_channel0 + w3 * _v3_channel0 + w4 * _v4_channel0; + _val_channel1 = w1 * _v1_channel1 + w2 * _v2_channel1 + w3 * _v3_channel1 + w4 * _v4_channel1; + _val_channel2 = w1 * _v1_channel2 + w2 * _v2_channel2 + w3 * _v3_channel2 + 
w4 * _v4_channel2; + _val_channel3 = w1 * _v1_channel3 + w2 * _v2_channel3 + w3 * _v3_channel3 + w4 * _v4_channel3; + _val_channel4 = w1 * _v1_channel4 + w2 * _v2_channel4 + w3 * _v3_channel4 + w4 * _v4_channel4; + _val_channel5 = w1 * _v1_channel5 + w2 * _v2_channel5 + w3 * _v3_channel5 + w4 * _v4_channel5; + _val_channel6 = w1 * _v1_channel6 + w2 * _v2_channel6 + w3 * _v3_channel6 + w4 * _v4_channel6; + _val_channel7 = w1 * _v1_channel7 + w2 * _v2_channel7 + w3 * _v3_channel7 + w4 * _v4_channel7; + _val_channel8 = w1 * _v1_channel8 + w2 * _v2_channel8 + w3 * _v3_channel8 + w4 * _v4_channel8; + _val_channel9 = w1 * _v1_channel9 + w2 * _v2_channel9 + w3 * _v3_channel9 + w4 * _v4_channel9; + _val_channela = w1 * _v1_channela + w2 * _v2_channela + w3 * _v3_channela + w4 * _v4_channela; + _val_channelb = w1 * _v1_channelb + w2 * _v2_channelb + w3 * _v3_channelb + w4 * _v4_channelb; + _val_channelc = w1 * _v1_channelc + w2 * _v2_channelc + w3 * _v3_channelc + w4 * _v4_channelc; + _val_channeld = w1 * _v1_channeld + w2 * _v2_channeld + w3 * _v3_channeld + w4 * _v4_channeld; + _val_channele = w1 * _v1_channele + w2 * _v2_channele + w3 * _v3_channele + w4 * _v4_channele; + _val_channelf = w1 * _v1_channelf + w2 * _v2_channelf + w3 * _v3_channelf + w4 * _v4_channelf; + } + if (has_mask) + { + _val_channel0 *= mask_; + _val_channel1 *= mask_; + _val_channel2 *= mask_; + _val_channel3 *= mask_; + _val_channel4 *= mask_; + _val_channel5 *= mask_; + _val_channel6 *= mask_; + _val_channel7 *= mask_; + _val_channel8 *= mask_; + _val_channel9 *= mask_; + _val_channela *= mask_; + _val_channelb *= mask_; + _val_channelc *= mask_; + _val_channeld *= mask_; + _val_channele *= mask_; + _val_channelf *= mask_; + } + float _conv_w0 = *(kptr); + float _conv_w1 = *(kptr + out_elempack); // 1 * out_elempack + _sum += (_val_channel0 * _conv_w0); + _sum += (_val_channel1 * _conv_w1); + float _conv_w2 = *(kptr + 2); // 2 * out_elempack + float _conv_w3 = *(kptr + 3); // 3 * out_elempack + _sum += (_val_channel2 * _conv_w2); + _sum += (_val_channel3 * _conv_w3); + float _conv_w4 = *(kptr + 4); // 4 * out_elempack + float _conv_w5 = *(kptr + 5); // 5 * out_elempack + _sum += (_val_channel4 * _conv_w4); + _sum += (_val_channel5 * _conv_w5); + float _conv_w6 = *(kptr + 6); // 6 * out_elempack + float _conv_w7 = *(kptr + 7); // 7 * out_elempack + _sum += (_val_channel6 * _conv_w6); + _sum += (_val_channel7 * _conv_w7); + float _conv_w8 = *(kptr + 8); // 8 * out_elempack + float _conv_w9 = *(kptr + 9); // 9 * out_elempack + _sum += (_val_channel8 * _conv_w8); + _sum += (_val_channel9 * _conv_w9); + float _conv_wa = *(kptr + 10); // 10 * out_elempack + float _conv_wb = *(kptr + 11); // 11 * out_elempack + _sum += (_val_channela * _conv_wa); + _sum += (_val_channelb * _conv_wb); + float _conv_wc = *(kptr + 12); // 12 * out_elempack + float _conv_wd = *(kptr + 13); // 13 * out_elempack + _sum += (_val_channelc * _conv_wc); + _sum += (_val_channeld * _conv_wd); + float _conv_we = *(kptr + 14); // 14 * out_elempack + float _conv_wf = *(kptr + 15); // 15 * out_elempack + _sum += (_val_channele * _conv_we); + _sum += (_val_channelf * _conv_wf); + kptr += wstep; + } + } + } + _sum = activation_ss(_sum, activation_type, activation_params); + *(outptr + h_col * outw + w_col) = _sum; + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack16to4.h b/src/layer/x86/deformableconv2d_pack16to4.h new file mode 100644 index 00000000000..8307def9948 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack16to4.h @@ -0,0 +1,433 @@ +// 
Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack16to4_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 16; + const int out_elempack = 4; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m128 _sum = _mm_setzero_ps(); + if (bias_data_ptr) + _sum = _mm_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool 
v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val_channel0 = _mm_loadu_ps(zeros_ptr); + __m128 _val_channel1 = _val_channel0; + __m128 _val_channel2 = _val_channel0; + __m128 _val_channel3 = _val_channel0; + __m128 _val_channel4 = _val_channel0; + __m128 _val_channel5 = _val_channel0; + __m128 _val_channel6 = _val_channel0; + __m128 _val_channel7 = _val_channel0; + __m128 _val_channel8 = _val_channel0; + __m128 _val_channel9 = _val_channel0; + __m128 _val_channela = _val_channel0; + __m128 _val_channelb = _val_channel0; + __m128 _val_channelc = _val_channel0; + __m128 _val_channeld = _val_channel0; + __m128 _val_channele = _val_channel0; + __m128 _val_channelf = _val_channel0; + if (cond) + { + __m128 _v1_channel0 = _val_channel0; + __m128 _v1_channel1 = _val_channel0; + __m128 _v1_channel2 = _val_channel0; + __m128 _v1_channel3 = _val_channel0; + __m128 _v1_channel4 = _val_channel0; + __m128 _v1_channel5 = _val_channel0; + __m128 _v1_channel6 = _val_channel0; + __m128 _v1_channel7 = _val_channel0; + __m128 _v1_channel8 = _val_channel0; + __m128 _v1_channel9 = _val_channel0; + __m128 _v1_channela = _val_channel0; + __m128 _v1_channelb = _val_channel0; + __m128 _v1_channelc = _val_channel0; + __m128 _v1_channeld = _val_channel0; + __m128 _v1_channele = _val_channel0; + __m128 _v1_channelf = _val_channel0; + __m128 _v2_channel0 = _val_channel0; + __m128 _v2_channel1 = _val_channel0; + __m128 _v2_channel2 = _val_channel0; + __m128 _v2_channel3 = _val_channel0; + __m128 _v2_channel4 = _val_channel0; + __m128 _v2_channel5 = _val_channel0; + __m128 _v2_channel6 = _val_channel0; + __m128 _v2_channel7 = _val_channel0; + __m128 _v2_channel8 = _val_channel0; + __m128 _v2_channel9 = _val_channel0; + __m128 _v2_channela = _val_channel0; + __m128 _v2_channelb = _val_channel0; + __m128 _v2_channelc = _val_channel0; + __m128 _v2_channeld = _val_channel0; + __m128 _v2_channele = _val_channel0; + __m128 _v2_channelf = _val_channel0; + __m128 _v3_channel0 = _val_channel0; + __m128 _v3_channel1 = _val_channel0; + __m128 _v3_channel2 = _val_channel0; + __m128 _v3_channel3 = _val_channel0; + __m128 _v3_channel4 = _val_channel0; + __m128 _v3_channel5 = _val_channel0; + __m128 _v3_channel6 = _val_channel0; + __m128 _v3_channel7 = _val_channel0; + __m128 _v3_channel8 = _val_channel0; + 
__m128 _v3_channel9 = _val_channel0; + __m128 _v3_channela = _val_channel0; + __m128 _v3_channelb = _val_channel0; + __m128 _v3_channelc = _val_channel0; + __m128 _v3_channeld = _val_channel0; + __m128 _v3_channele = _val_channel0; + __m128 _v3_channelf = _val_channel0; + __m128 _v4_channel0 = _val_channel0; + __m128 _v4_channel1 = _val_channel0; + __m128 _v4_channel2 = _val_channel0; + __m128 _v4_channel3 = _val_channel0; + __m128 _v4_channel4 = _val_channel0; + __m128 _v4_channel5 = _val_channel0; + __m128 _v4_channel6 = _val_channel0; + __m128 _v4_channel7 = _val_channel0; + __m128 _v4_channel8 = _val_channel0; + __m128 _v4_channel9 = _val_channel0; + __m128 _v4_channela = _val_channel0; + __m128 _v4_channelb = _val_channel0; + __m128 _v4_channelc = _val_channel0; + __m128 _v4_channeld = _val_channel0; + __m128 _v4_channele = _val_channel0; + __m128 _v4_channelf = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm_load1_ps(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 7); + _v1_channel8 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 8); + _v1_channel9 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 9); + _v1_channela = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 10); + _v1_channelb = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 11); + _v1_channelc = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 12); + _v1_channeld = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 13); + _v1_channele = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 14); + _v1_channelf = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 15); + } + if (v2_cond) + { + _v2_channel0 = _mm_load1_ps(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 7); + _v2_channel8 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 8); + _v2_channel9 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 9); + _v2_channela = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 10); + _v2_channelb = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 11); + _v2_channelc = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 12); + _v2_channeld = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 13); + _v2_channele = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 14); + _v2_channelf = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 15); + } + if (v3_cond) + { + _v3_channel0 = _mm_load1_ps(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 5); + 
_v3_channel6 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 7); + _v3_channel8 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 8); + _v3_channel9 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 9); + _v3_channela = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 10); + _v3_channelb = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 11); + _v3_channelc = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 12); + _v3_channeld = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 13); + _v3_channele = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 14); + _v3_channelf = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 15); + } + if (v4_cond) + { + _v4_channel0 = _mm_load1_ps(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 7); + _v4_channel8 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 8); + _v4_channel9 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 9); + _v4_channela = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 10); + _v4_channelb = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 11); + _v4_channelc = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 12); + _v4_channeld = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 13); + _v4_channele = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 14); + _v4_channelf = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 15); + } + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val_channel0 = _mm_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm_comp_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm_comp_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = 
_mm_comp_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm_comp_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm_comp_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v4_channel7, _w4, _val_channel7); + _val_channel8 = _mm_comp_fmadd_ps(_v1_channel8, _w1, _val_channel8); + _val_channel8 = _mm_comp_fmadd_ps(_v2_channel8, _w2, _val_channel8); + _val_channel8 = _mm_comp_fmadd_ps(_v3_channel8, _w3, _val_channel8); + _val_channel8 = _mm_comp_fmadd_ps(_v4_channel8, _w4, _val_channel8); + _val_channel9 = _mm_comp_fmadd_ps(_v1_channel9, _w1, _val_channel9); + _val_channel9 = _mm_comp_fmadd_ps(_v2_channel9, _w2, _val_channel9); + _val_channel9 = _mm_comp_fmadd_ps(_v3_channel9, _w3, _val_channel9); + _val_channel9 = _mm_comp_fmadd_ps(_v4_channel9, _w4, _val_channel9); + _val_channela = _mm_comp_fmadd_ps(_v1_channela, _w1, _val_channela); + _val_channela = _mm_comp_fmadd_ps(_v2_channela, _w2, _val_channela); + _val_channela = _mm_comp_fmadd_ps(_v3_channela, _w3, _val_channela); + _val_channela = _mm_comp_fmadd_ps(_v4_channela, _w4, _val_channela); + _val_channelb = _mm_comp_fmadd_ps(_v1_channelb, _w1, _val_channelb); + _val_channelb = _mm_comp_fmadd_ps(_v2_channelb, _w2, _val_channelb); + _val_channelb = _mm_comp_fmadd_ps(_v3_channelb, _w3, _val_channelb); + _val_channelb = _mm_comp_fmadd_ps(_v4_channelb, _w4, _val_channelb); + _val_channelc = _mm_comp_fmadd_ps(_v1_channelc, _w1, _val_channelc); + _val_channelc = _mm_comp_fmadd_ps(_v2_channelc, _w2, _val_channelc); + _val_channelc = _mm_comp_fmadd_ps(_v3_channelc, _w3, _val_channelc); + _val_channelc = _mm_comp_fmadd_ps(_v4_channelc, _w4, _val_channelc); + _val_channeld = _mm_comp_fmadd_ps(_v1_channeld, _w1, _val_channeld); + _val_channeld = _mm_comp_fmadd_ps(_v2_channeld, _w2, _val_channeld); + _val_channeld = _mm_comp_fmadd_ps(_v3_channeld, _w3, _val_channeld); + _val_channeld = _mm_comp_fmadd_ps(_v4_channeld, _w4, _val_channeld); + _val_channele = _mm_comp_fmadd_ps(_v1_channele, _w1, _val_channele); + _val_channele = _mm_comp_fmadd_ps(_v2_channele, _w2, _val_channele); + _val_channele = _mm_comp_fmadd_ps(_v3_channele, _w3, _val_channele); + _val_channele = _mm_comp_fmadd_ps(_v4_channele, _w4, _val_channele); + _val_channelf = _mm_comp_fmadd_ps(_v1_channelf, _w1, _val_channelf); + _val_channelf = _mm_comp_fmadd_ps(_v2_channelf, _w2, _val_channelf); + _val_channelf = _mm_comp_fmadd_ps(_v3_channelf, _w3, _val_channelf); + _val_channelf = _mm_comp_fmadd_ps(_v4_channelf, _w4, _val_channelf); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val_channel0 = _mm_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm_mul_ps(_val_channel7, _mask); + _val_channel8 = _mm_mul_ps(_val_channel8, _mask); + _val_channel9 = _mm_mul_ps(_val_channel9, _mask); + 
_val_channela = _mm_mul_ps(_val_channela, _mask); + _val_channelb = _mm_mul_ps(_val_channelb, _mask); + _val_channelc = _mm_mul_ps(_val_channelc, _mask); + _val_channeld = _mm_mul_ps(_val_channeld, _mask); + _val_channele = _mm_mul_ps(_val_channele, _mask); + _val_channelf = _mm_mul_ps(_val_channelf, _mask); + } + __m128 _conv_w0 = _mm_load_ps(kptr); + __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m128 _conv_w2 = _mm_load_ps(kptr + 8); // 2 * out_elempack + __m128 _conv_w3 = _mm_load_ps(kptr + 12); // 3 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m128 _conv_w4 = _mm_load_ps(kptr + 16); // 4 * out_elempack + __m128 _conv_w5 = _mm_load_ps(kptr + 20); // 5 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m128 _conv_w6 = _mm_load_ps(kptr + 24); // 6 * out_elempack + __m128 _conv_w7 = _mm_load_ps(kptr + 28); // 7 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel7, _conv_w7, _sum); + __m128 _conv_w8 = _mm_load_ps(kptr + 32); // 8 * out_elempack + __m128 _conv_w9 = _mm_load_ps(kptr + 36); // 9 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel8, _conv_w8, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel9, _conv_w9, _sum); + __m128 _conv_wa = _mm_load_ps(kptr + 40); // 10 * out_elempack + __m128 _conv_wb = _mm_load_ps(kptr + 44); // 11 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channela, _conv_wa, _sum); + _sum = _mm_comp_fmadd_ps(_val_channelb, _conv_wb, _sum); + __m128 _conv_wc = _mm_load_ps(kptr + 48); // 12 * out_elempack + __m128 _conv_wd = _mm_load_ps(kptr + 52); // 13 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channelc, _conv_wc, _sum); + _sum = _mm_comp_fmadd_ps(_val_channeld, _conv_wd, _sum); + __m128 _conv_we = _mm_load_ps(kptr + 56); // 14 * out_elempack + __m128 _conv_wf = _mm_load_ps(kptr + 60); // 15 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channele, _conv_we, _sum); + _sum = _mm_comp_fmadd_ps(_val_channelf, _conv_wf, _sum); + kptr += wstep; + } + } + } + _sum = activation_sse(_sum, activation_type, activation_params); + _mm_storeu_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack16to8.h b/src/layer/x86/deformableconv2d_pack16to8.h new file mode 100644 index 00000000000..36c7d222abe --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack16to8.h @@ -0,0 +1,433 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
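+
+// Implementation notes (summarizing the code below): the input blob is
+// packed with elempack = 16 and the output with out_elempack = 8, so the
+// packed weight pointer kptr advances by wstep = 16 * 8 = 128 floats per
+// input-channel step, each group of 8 floats holding one input channel's
+// weights for the 8 output channels of this block.
+// Bilinear sampling at the deformed location (h_im, w_im) combines the
+// four neighboring pixels v1..v4 with weights
+//   w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw,
+// where lh = h_im - floor(h_im), lw = w_im - floor(w_im), hh = 1 - lh,
+// hw = 1 - lw; out-of-bounds neighbors contribute zero.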
+ +static void deformableconv2d_pack16to8_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 16; + const int out_elempack = 8; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m256 _sum = _mm256_setzero_ps(); + if (bias_data_ptr) + _sum = _mm256_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + 
w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val_channel0 = _mm256_loadu_ps(zeros_ptr); + __m256 _val_channel1 = _val_channel0; + __m256 _val_channel2 = _val_channel0; + __m256 _val_channel3 = _val_channel0; + __m256 _val_channel4 = _val_channel0; + __m256 _val_channel5 = _val_channel0; + __m256 _val_channel6 = _val_channel0; + __m256 _val_channel7 = _val_channel0; + __m256 _val_channel8 = _val_channel0; + __m256 _val_channel9 = _val_channel0; + __m256 _val_channela = _val_channel0; + __m256 _val_channelb = _val_channel0; + __m256 _val_channelc = _val_channel0; + __m256 _val_channeld = _val_channel0; + __m256 _val_channele = _val_channel0; + __m256 _val_channelf = _val_channel0; + if (cond) + { + __m256 _v1_channel0 = _val_channel0; + __m256 _v1_channel1 = _val_channel0; + __m256 _v1_channel2 = _val_channel0; + __m256 _v1_channel3 = _val_channel0; + __m256 _v1_channel4 = _val_channel0; + __m256 _v1_channel5 = _val_channel0; + __m256 _v1_channel6 = _val_channel0; + __m256 _v1_channel7 = _val_channel0; + __m256 _v1_channel8 = _val_channel0; + __m256 _v1_channel9 = _val_channel0; + __m256 _v1_channela = _val_channel0; + __m256 _v1_channelb = _val_channel0; + __m256 _v1_channelc = _val_channel0; + __m256 _v1_channeld = _val_channel0; + __m256 _v1_channele = _val_channel0; + __m256 _v1_channelf = _val_channel0; + __m256 _v2_channel0 = _val_channel0; + __m256 _v2_channel1 = _val_channel0; + __m256 _v2_channel2 = _val_channel0; + __m256 _v2_channel3 = _val_channel0; + __m256 _v2_channel4 = _val_channel0; + __m256 _v2_channel5 = _val_channel0; + __m256 _v2_channel6 = _val_channel0; + __m256 _v2_channel7 = _val_channel0; + __m256 _v2_channel8 = _val_channel0; + __m256 _v2_channel9 = _val_channel0; + __m256 _v2_channela = _val_channel0; + __m256 _v2_channelb = _val_channel0; + __m256 _v2_channelc = _val_channel0; + __m256 _v2_channeld = _val_channel0; + __m256 _v2_channele = _val_channel0; + __m256 _v2_channelf = _val_channel0; + __m256 _v3_channel0 = _val_channel0; + __m256 _v3_channel1 = _val_channel0; + __m256 _v3_channel2 = _val_channel0; + __m256 _v3_channel3 = _val_channel0; + __m256 _v3_channel4 = _val_channel0; + __m256 _v3_channel5 = _val_channel0; + __m256 _v3_channel6 = _val_channel0; + __m256 _v3_channel7 = _val_channel0; + __m256 _v3_channel8 = _val_channel0; + __m256 _v3_channel9 = _val_channel0; + __m256 _v3_channela = _val_channel0; + __m256 _v3_channelb = _val_channel0; + __m256 _v3_channelc = _val_channel0; + __m256 _v3_channeld = _val_channel0; + __m256 _v3_channele = _val_channel0; + __m256 _v3_channelf = _val_channel0; + __m256 _v4_channel0 = _val_channel0; + __m256 _v4_channel1 = _val_channel0; + __m256 _v4_channel2 = _val_channel0; + __m256 _v4_channel3 = _val_channel0; + __m256 _v4_channel4 = _val_channel0; + __m256 _v4_channel5 = _val_channel0; + __m256 _v4_channel6 = _val_channel0; + __m256 _v4_channel7 = _val_channel0; + __m256 
_v4_channel8 = _val_channel0; + __m256 _v4_channel9 = _val_channel0; + __m256 _v4_channela = _val_channel0; + __m256 _v4_channelb = _val_channel0; + __m256 _v4_channelc = _val_channel0; + __m256 _v4_channeld = _val_channel0; + __m256 _v4_channele = _val_channel0; + __m256 _v4_channelf = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 7); + _v1_channel8 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 8); + _v1_channel9 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 9); + _v1_channela = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 10); + _v1_channelb = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 11); + _v1_channelc = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 12); + _v1_channeld = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 13); + _v1_channele = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 14); + _v1_channelf = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 15); + } + if (v2_cond) + { + _v2_channel0 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 7); + _v2_channel8 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 8); + _v2_channel9 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 9); + _v2_channela = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 10); + _v2_channelb = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 11); + _v2_channelc = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 12); + _v2_channeld = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 13); + _v2_channele = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 14); + _v2_channelf = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 15); + } + if (v3_cond) + { + _v3_channel0 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 7); + _v3_channel8 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 8); + _v3_channel9 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 9); + _v3_channela = 
_mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 10); + _v3_channelb = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 11); + _v3_channelc = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 12); + _v3_channeld = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 13); + _v3_channele = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 14); + _v3_channelf = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 15); + } + if (v4_cond) + { + _v4_channel0 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 7); + _v4_channel8 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 8); + _v4_channel9 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 9); + _v4_channela = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 10); + _v4_channelb = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 11); + _v4_channelc = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 12); + _v4_channeld = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 13); + _v4_channele = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 14); + _v4_channelf = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 15); + } + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val_channel0 = _mm256_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm256_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm256_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm256_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm256_comp_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm256_comp_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = 
_mm256_comp_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm256_comp_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm256_comp_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v4_channel7, _w4, _val_channel7); + _val_channel8 = _mm256_comp_fmadd_ps(_v1_channel8, _w1, _val_channel8); + _val_channel8 = _mm256_comp_fmadd_ps(_v2_channel8, _w2, _val_channel8); + _val_channel8 = _mm256_comp_fmadd_ps(_v3_channel8, _w3, _val_channel8); + _val_channel8 = _mm256_comp_fmadd_ps(_v4_channel8, _w4, _val_channel8); + _val_channel9 = _mm256_comp_fmadd_ps(_v1_channel9, _w1, _val_channel9); + _val_channel9 = _mm256_comp_fmadd_ps(_v2_channel9, _w2, _val_channel9); + _val_channel9 = _mm256_comp_fmadd_ps(_v3_channel9, _w3, _val_channel9); + _val_channel9 = _mm256_comp_fmadd_ps(_v4_channel9, _w4, _val_channel9); + _val_channela = _mm256_comp_fmadd_ps(_v1_channela, _w1, _val_channela); + _val_channela = _mm256_comp_fmadd_ps(_v2_channela, _w2, _val_channela); + _val_channela = _mm256_comp_fmadd_ps(_v3_channela, _w3, _val_channela); + _val_channela = _mm256_comp_fmadd_ps(_v4_channela, _w4, _val_channela); + _val_channelb = _mm256_comp_fmadd_ps(_v1_channelb, _w1, _val_channelb); + _val_channelb = _mm256_comp_fmadd_ps(_v2_channelb, _w2, _val_channelb); + _val_channelb = _mm256_comp_fmadd_ps(_v3_channelb, _w3, _val_channelb); + _val_channelb = _mm256_comp_fmadd_ps(_v4_channelb, _w4, _val_channelb); + _val_channelc = _mm256_comp_fmadd_ps(_v1_channelc, _w1, _val_channelc); + _val_channelc = _mm256_comp_fmadd_ps(_v2_channelc, _w2, _val_channelc); + _val_channelc = _mm256_comp_fmadd_ps(_v3_channelc, _w3, _val_channelc); + _val_channelc = _mm256_comp_fmadd_ps(_v4_channelc, _w4, _val_channelc); + _val_channeld = _mm256_comp_fmadd_ps(_v1_channeld, _w1, _val_channeld); + _val_channeld = _mm256_comp_fmadd_ps(_v2_channeld, _w2, _val_channeld); + _val_channeld = _mm256_comp_fmadd_ps(_v3_channeld, _w3, _val_channeld); + _val_channeld = _mm256_comp_fmadd_ps(_v4_channeld, _w4, _val_channeld); + _val_channele = _mm256_comp_fmadd_ps(_v1_channele, _w1, _val_channele); + _val_channele = _mm256_comp_fmadd_ps(_v2_channele, _w2, _val_channele); + _val_channele = _mm256_comp_fmadd_ps(_v3_channele, _w3, _val_channele); + _val_channele = _mm256_comp_fmadd_ps(_v4_channele, _w4, _val_channele); + _val_channelf = _mm256_comp_fmadd_ps(_v1_channelf, _w1, _val_channelf); + _val_channelf = _mm256_comp_fmadd_ps(_v2_channelf, _w2, _val_channelf); + _val_channelf = _mm256_comp_fmadd_ps(_v3_channelf, _w3, _val_channelf); + _val_channelf = _mm256_comp_fmadd_ps(_v4_channelf, _w4, _val_channelf); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val_channel0 = _mm256_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm256_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm256_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm256_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm256_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm256_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm256_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm256_mul_ps(_val_channel7, _mask); + _val_channel8 = 
_mm256_mul_ps(_val_channel8, _mask); + _val_channel9 = _mm256_mul_ps(_val_channel9, _mask); + _val_channela = _mm256_mul_ps(_val_channela, _mask); + _val_channelb = _mm256_mul_ps(_val_channelb, _mask); + _val_channelc = _mm256_mul_ps(_val_channelc, _mask); + _val_channeld = _mm256_mul_ps(_val_channeld, _mask); + _val_channele = _mm256_mul_ps(_val_channele, _mask); + _val_channelf = _mm256_mul_ps(_val_channelf, _mask); + } + __m256 _conv_w0 = _mm256_load_ps(kptr); + __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m256 _conv_w2 = _mm256_load_ps(kptr + 16); // 2 * out_elempack + __m256 _conv_w3 = _mm256_load_ps(kptr + 24); // 3 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m256 _conv_w4 = _mm256_load_ps(kptr + 32); // 4 * out_elempack + __m256 _conv_w5 = _mm256_load_ps(kptr + 40); // 5 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m256 _conv_w6 = _mm256_load_ps(kptr + 48); // 6 * out_elempack + __m256 _conv_w7 = _mm256_load_ps(kptr + 56); // 7 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel7, _conv_w7, _sum); + __m256 _conv_w8 = _mm256_load_ps(kptr + 64); // 8 * out_elempack + __m256 _conv_w9 = _mm256_load_ps(kptr + 72); // 9 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel8, _conv_w8, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel9, _conv_w9, _sum); + __m256 _conv_wa = _mm256_load_ps(kptr + 80); // 10 * out_elempack + __m256 _conv_wb = _mm256_load_ps(kptr + 88); // 11 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channela, _conv_wa, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channelb, _conv_wb, _sum); + __m256 _conv_wc = _mm256_load_ps(kptr + 96); // 12 * out_elempack + __m256 _conv_wd = _mm256_load_ps(kptr + 104); // 13 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channelc, _conv_wc, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channeld, _conv_wd, _sum); + __m256 _conv_we = _mm256_load_ps(kptr + 112); // 14 * out_elempack + __m256 _conv_wf = _mm256_load_ps(kptr + 120); // 15 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channele, _conv_we, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channelf, _conv_wf, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx(_sum, activation_type, activation_params); + _mm256_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack1to16.h b/src/layer/x86/deformableconv2d_pack1to16.h new file mode 100644 index 00000000000..e3f18e84a22 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack1to16.h @@ -0,0 +1,193 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack1to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 1; + const int out_elempack = 16; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m512 _sum = _mm512_setzero_ps(); + if (bias_data_ptr) + _sum = _mm512_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + 
w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val_channel0 = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1_channel0 = _val_channel0; + __m512 _v2_channel0 = _val_channel0; + __m512 _v3_channel0 = _val_channel0; + __m512 _v4_channel0 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm512_set1_ps(data_im_ptr[v1_pos]); + } + if (v2_cond) + { + _v2_channel0 = _mm512_set1_ps(data_im_ptr[v2_pos]); + } + if (v3_cond) + { + _v3_channel0 = _mm512_set1_ps(data_im_ptr[v3_pos]); + } + if (v4_cond) + { + _v4_channel0 = _mm512_set1_ps(data_im_ptr[v4_pos]); + } + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val_channel0 = _mm512_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v4_channel0, _w4, _val_channel0); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val_channel0 = _mm512_mul_ps(_val_channel0, _mask); + } + __m512 _conv_w0 = _mm512_load_ps(kptr); + _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx512(_sum, activation_type, activation_params); + _mm512_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack1to4.h b/src/layer/x86/deformableconv2d_pack1to4.h new file mode 100644 index 00000000000..a3fb9a19d4a --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack1to4.h @@ -0,0 +1,193 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
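+
+// Implementation notes (summarizing the code below): the input here is
+// unpacked (elempack = 1) while the output is packed with out_elempack = 4,
+// so each bilinearly sampled input value is a scalar that _mm_load1_ps
+// broadcasts across the 4 output lanes before the fused multiply-add with
+// the 4 packed weights; kptr therefore advances by wstep = 4 floats per
+// input channel.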
+ +static void deformableconv2d_pack1to4_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 1; + const int out_elempack = 4; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m128 _sum = _mm_setzero_ps(); + if (bias_data_ptr) + _sum = _mm_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * 
lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val_channel0 = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1_channel0 = _val_channel0; + __m128 _v2_channel0 = _val_channel0; + __m128 _v3_channel0 = _val_channel0; + __m128 _v4_channel0 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm_load1_ps(data_im_ptr + v1_pos); + } + if (v2_cond) + { + _v2_channel0 = _mm_load1_ps(data_im_ptr + v2_pos); + } + if (v3_cond) + { + _v3_channel0 = _mm_load1_ps(data_im_ptr + v3_pos); + } + if (v4_cond) + { + _v4_channel0 = _mm_load1_ps(data_im_ptr + v4_pos); + } + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val_channel0 = _mm_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val_channel0 = _mm_mul_ps(_val_channel0, _mask); + } + __m128 _conv_w0 = _mm_load_ps(kptr); + _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + kptr += wstep; + } + } + } + _sum = activation_sse(_sum, activation_type, activation_params); + _mm_storeu_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack1to8.h b/src/layer/x86/deformableconv2d_pack1to8.h new file mode 100644 index 00000000000..f607972fa21 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack1to8.h @@ -0,0 +1,193 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack1to8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 1; + const int out_elempack = 8; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m256 _sum = _mm256_setzero_ps(); + if (bias_data_ptr) + _sum = _mm256_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const 
float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val_channel0 = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1_channel0 = _val_channel0; + __m256 _v2_channel0 = _val_channel0; + __m256 _v3_channel0 = _val_channel0; + __m256 _v4_channel0 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm256_broadcast_ss(data_im_ptr + v1_pos); + } + if (v2_cond) + { + _v2_channel0 = _mm256_broadcast_ss(data_im_ptr + v2_pos); + } + if (v3_cond) + { + _v3_channel0 = _mm256_broadcast_ss(data_im_ptr + v3_pos); + } + if (v4_cond) + { + _v4_channel0 = _mm256_broadcast_ss(data_im_ptr + v4_pos); + } + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val_channel0 = _mm256_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val_channel0 = _mm256_mul_ps(_val_channel0, _mask); + } + __m256 _conv_w0 = _mm256_load_ps(kptr); + _sum = _mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx(_sum, activation_type, activation_params); + _mm256_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack4.h b/src/layer/x86/deformableconv2d_pack4.h new file mode 100644 index 00000000000..d6710fdba0e --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack4.h @@ -0,0 +1,241 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack4_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 4; + const int out_elempack = 4; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m128 _sum = _mm_setzero_ps(); + if (bias_data_ptr) + _sum = _mm_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + 
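+                        // _val_channelN accumulates the bilinear sample of input lane N,
+                        // broadcast across the four output lanes; it stays zero when cond
+                        // is false, i.e. the sample point lies outside the input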
__m128 _val_channel0 = _mm_loadu_ps(zeros_ptr); + __m128 _val_channel1 = _val_channel0; + __m128 _val_channel2 = _val_channel0; + __m128 _val_channel3 = _val_channel0; + if (cond) + { + __m128 _v1_channel0 = _val_channel0; + __m128 _v1_channel1 = _val_channel0; + __m128 _v1_channel2 = _val_channel0; + __m128 _v1_channel3 = _val_channel0; + __m128 _v2_channel0 = _val_channel0; + __m128 _v2_channel1 = _val_channel0; + __m128 _v2_channel2 = _val_channel0; + __m128 _v2_channel3 = _val_channel0; + __m128 _v3_channel0 = _val_channel0; + __m128 _v3_channel1 = _val_channel0; + __m128 _v3_channel2 = _val_channel0; + __m128 _v3_channel3 = _val_channel0; + __m128 _v4_channel0 = _val_channel0; + __m128 _v4_channel1 = _val_channel0; + __m128 _v4_channel2 = _val_channel0; + __m128 _v4_channel3 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm_load1_ps(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 3); + } + if (v2_cond) + { + _v2_channel0 = _mm_load1_ps(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 3); + } + if (v3_cond) + { + _v3_channel0 = _mm_load1_ps(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 3); + } + if (v4_cond) + { + _v4_channel0 = _mm_load1_ps(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 3); + } + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val_channel0 = _mm_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val_channel0 = _mm_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm_mul_ps(_val_channel3, _mask); + } + __m128 _conv_w0 = 
_mm_load_ps(kptr); + __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m128 _conv_w2 = _mm_load_ps(kptr + 8); // 2 * out_elempack + __m128 _conv_w3 = _mm_load_ps(kptr + 12); // 3 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + kptr += wstep; + } + } + } + _sum = activation_sse(_sum, activation_type, activation_params); + _mm_storeu_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack4to1.h b/src/layer/x86/deformableconv2d_pack4to1.h new file mode 100644 index 00000000000..597fe5e17a7 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack4to1.h @@ -0,0 +1,209 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack4to1_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 4; + const int out_elempack = 1; + const int wstep = out_elempack * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + float _sum = 0.f; + if (bias_data_ptr) + _sum = *(bias_data_ptr + oc); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + float _val_channel0 = 0.f; + float _val_channel1 = _val_channel0; + float _val_channel2 = _val_channel0; + float _val_channel3 = _val_channel0; + if (cond) + { + float _v1_channel0 = _val_channel0; + float _v1_channel1 = _val_channel0; + float _v1_channel2 = _val_channel0; + float _v1_channel3 = _val_channel0; + float _v2_channel0 = _val_channel0; + float _v2_channel1 = _val_channel0; + float _v2_channel2 = _val_channel0; + float _v2_channel3 = _val_channel0; + float _v3_channel0 = _val_channel0; + float _v3_channel1 = _val_channel0; + float 
_v3_channel2 = _val_channel0; + float _v3_channel3 = _val_channel0; + float _v4_channel0 = _val_channel0; + float _v4_channel1 = _val_channel0; + float _v4_channel2 = _val_channel0; + float _v4_channel3 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = *(data_im_ptr + v1_pos * elempack); + _v1_channel1 = *(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = *(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = *(data_im_ptr + v1_pos * elempack + 3); + } + if (v2_cond) + { + _v2_channel0 = *(data_im_ptr + v2_pos * elempack); + _v2_channel1 = *(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = *(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = *(data_im_ptr + v2_pos * elempack + 3); + } + if (v3_cond) + { + _v3_channel0 = *(data_im_ptr + v3_pos * elempack); + _v3_channel1 = *(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = *(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = *(data_im_ptr + v3_pos * elempack + 3); + } + if (v4_cond) + { + _v4_channel0 = *(data_im_ptr + v4_pos * elempack); + _v4_channel1 = *(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = *(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = *(data_im_ptr + v4_pos * elempack + 3); + } + _val_channel0 = w1 * _v1_channel0 + w2 * _v2_channel0 + w3 * _v3_channel0 + w4 * _v4_channel0; + _val_channel1 = w1 * _v1_channel1 + w2 * _v2_channel1 + w3 * _v3_channel1 + w4 * _v4_channel1; + _val_channel2 = w1 * _v1_channel2 + w2 * _v2_channel2 + w3 * _v3_channel2 + w4 * _v4_channel2; + _val_channel3 = w1 * _v1_channel3 + w2 * _v2_channel3 + w3 * _v3_channel3 + w4 * _v4_channel3; + } + if (has_mask) + { + _val_channel0 *= mask_; + _val_channel1 *= mask_; + _val_channel2 *= mask_; + _val_channel3 *= mask_; + } + float _conv_w0 = *(kptr); + float _conv_w1 = *(kptr + out_elempack); // 1 * out_elempack + float _conv_w2 = *(kptr + 2); // 2 * out_elempack + float _conv_w3 = *(kptr + 3); // 3 * out_elempack + _sum += (_val_channel0 * _conv_w0 + _val_channel1 * _conv_w1 + _val_channel2 * _conv_w2 + _val_channel3 * _conv_w3); + kptr += wstep; + } + } + } + _sum = activation_ss(_sum, activation_type, activation_params); + *(outptr + h_col * outw + w_col) = _sum; + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack4to16.h b/src/layer/x86/deformableconv2d_pack4to16.h new file mode 100644 index 00000000000..74dc663962b --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack4to16.h @@ -0,0 +1,241 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
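+
+// Scalar sketch of the bilinear sampling shared by all kernels in these
+// headers (same naming as the code below): for a fractional sample point
+// (h_im, w_im),
+//   h_low = floor(h_im); w_low = floor(w_im);
+//   lh = h_im - h_low;   lw = w_im - w_low;
+//   val = (1-lh)*(1-lw)*v1 + (1-lh)*lw*v2 + lh*(1-lw)*v3 + lh*lw*v4;
+// where v1..v4 are the four neighbouring pixels (taken as zero outside the
+// input) and val is then scaled by the optional modulation mask.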
+ +static void deformableconv2d_pack4to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 4; + const int out_elempack = 16; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m512 _sum = _mm512_setzero_ps(); + if (bias_data_ptr) + _sum = _mm512_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if 
(v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val_channel0 = _mm512_loadu_ps(zeros_ptr); + __m512 _val_channel1 = _val_channel0; + __m512 _val_channel2 = _val_channel0; + __m512 _val_channel3 = _val_channel0; + if (cond) + { + __m512 _v1_channel0 = _val_channel0; + __m512 _v1_channel1 = _val_channel0; + __m512 _v1_channel2 = _val_channel0; + __m512 _v1_channel3 = _val_channel0; + __m512 _v2_channel0 = _val_channel0; + __m512 _v2_channel1 = _val_channel0; + __m512 _v2_channel2 = _val_channel0; + __m512 _v2_channel3 = _val_channel0; + __m512 _v3_channel0 = _val_channel0; + __m512 _v3_channel1 = _val_channel0; + __m512 _v3_channel2 = _val_channel0; + __m512 _v3_channel3 = _val_channel0; + __m512 _v4_channel0 = _val_channel0; + __m512 _v4_channel1 = _val_channel0; + __m512 _v4_channel2 = _val_channel0; + __m512 _v4_channel3 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack]); + _v1_channel1 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 1]); + _v1_channel2 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 2]); + _v1_channel3 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 3]); + } + if (v2_cond) + { + _v2_channel0 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack]); + _v2_channel1 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 1]); + _v2_channel2 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 2]); + _v2_channel3 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 3]); + } + if (v3_cond) + { + _v3_channel0 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack]); + _v3_channel1 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 1]); + _v3_channel2 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 2]); + _v3_channel3 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 3]); + } + if (v4_cond) + { + _v4_channel0 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack]); + _v4_channel1 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 1]); + _v4_channel2 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 2]); + _v4_channel3 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 3]); + } + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val_channel0 = _mm512_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm512_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v3_channel1, _w3, _val_channel1); + 
_val_channel1 = _mm512_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm512_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm512_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v4_channel3, _w4, _val_channel3); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val_channel0 = _mm512_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm512_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm512_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm512_mul_ps(_val_channel3, _mask); + } + __m512 _conv_w0 = _mm512_load_ps(kptr); + __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm512_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m512 _conv_w2 = _mm512_load_ps(kptr + 32); // 2 * out_elempack + __m512 _conv_w3 = _mm512_load_ps(kptr + 48); // 3 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm512_fmadd_ps(_val_channel3, _conv_w3, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx512(_sum, activation_type, activation_params); + _mm512_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack4to8.h b/src/layer/x86/deformableconv2d_pack4to8.h new file mode 100644 index 00000000000..fc5830dce57 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack4to8.h @@ -0,0 +1,241 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack4to8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 4; + const int out_elempack = 8; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m256 _sum = _mm256_setzero_ps(); + if (bias_data_ptr) + _sum = _mm256_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const 
float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val_channel0 = _mm256_loadu_ps(zeros_ptr); + __m256 _val_channel1 = _val_channel0; + __m256 _val_channel2 = _val_channel0; + __m256 _val_channel3 = _val_channel0; + if (cond) + { + __m256 _v1_channel0 = _val_channel0; + __m256 _v1_channel1 = _val_channel0; + __m256 _v1_channel2 = _val_channel0; + __m256 _v1_channel3 = _val_channel0; + __m256 _v2_channel0 = _val_channel0; + __m256 _v2_channel1 = _val_channel0; + __m256 _v2_channel2 = _val_channel0; + __m256 _v2_channel3 = _val_channel0; + __m256 _v3_channel0 = _val_channel0; + __m256 _v3_channel1 = _val_channel0; + __m256 _v3_channel2 = _val_channel0; + __m256 _v3_channel3 = _val_channel0; + __m256 _v4_channel0 = _val_channel0; + __m256 _v4_channel1 = _val_channel0; + __m256 _v4_channel2 = _val_channel0; + __m256 _v4_channel3 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 3); + } + if (v2_cond) + { + _v2_channel0 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 3); + } + if (v3_cond) + { + _v3_channel0 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 3); + } + if (v4_cond) + { + _v4_channel0 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 3); + } + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val_channel0 = _mm256_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm256_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm256_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm256_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); 
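+                            // _val_channel0..3 now hold the interpolated samples of the
+                            // four input lanes for this kernel tap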
+ } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val_channel0 = _mm256_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm256_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm256_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm256_mul_ps(_val_channel3, _mask); + } + __m256 _conv_w0 = _mm256_load_ps(kptr); + __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m256 _conv_w2 = _mm256_load_ps(kptr + 16); // 2 * out_elempack + __m256 _conv_w3 = _mm256_load_ps(kptr + 24); // 3 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx(_sum, activation_type, activation_params); + _mm256_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack8.h b/src/layer/x86/deformableconv2d_pack8.h new file mode 100644 index 00000000000..696e4550818 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack8.h @@ -0,0 +1,305 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 8; + const int out_elempack = 8; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m256 _sum = _mm256_setzero_ps(); + if (bias_data_ptr) + _sum = _mm256_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const 
float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val_channel0 = _mm256_loadu_ps(zeros_ptr); + __m256 _val_channel1 = _val_channel0; + __m256 _val_channel2 = _val_channel0; + __m256 _val_channel3 = _val_channel0; + __m256 _val_channel4 = _val_channel0; + __m256 _val_channel5 = _val_channel0; + __m256 _val_channel6 = _val_channel0; + __m256 _val_channel7 = _val_channel0; + if (cond) + { + __m256 _v1_channel0 = _val_channel0; + __m256 _v1_channel1 = _val_channel0; + __m256 _v1_channel2 = _val_channel0; + __m256 _v1_channel3 = _val_channel0; + __m256 _v1_channel4 = _val_channel0; + __m256 _v1_channel5 = _val_channel0; + __m256 _v1_channel6 = _val_channel0; + __m256 _v1_channel7 = _val_channel0; + __m256 _v2_channel0 = _val_channel0; + __m256 _v2_channel1 = _val_channel0; + __m256 _v2_channel2 = _val_channel0; + __m256 _v2_channel3 = _val_channel0; + __m256 _v2_channel4 = _val_channel0; + __m256 _v2_channel5 = _val_channel0; + __m256 _v2_channel6 = _val_channel0; + __m256 _v2_channel7 = _val_channel0; + __m256 _v3_channel0 = _val_channel0; + __m256 _v3_channel1 = _val_channel0; + __m256 _v3_channel2 = _val_channel0; + __m256 _v3_channel3 = _val_channel0; + __m256 _v3_channel4 = _val_channel0; + __m256 _v3_channel5 = _val_channel0; + __m256 _v3_channel6 = _val_channel0; + __m256 _v3_channel7 = _val_channel0; + __m256 _v4_channel0 = _val_channel0; + __m256 _v4_channel1 = _val_channel0; + __m256 _v4_channel2 = _val_channel0; + __m256 _v4_channel3 = _val_channel0; + __m256 _v4_channel4 = _val_channel0; + __m256 _v4_channel5 = _val_channel0; + __m256 _v4_channel6 = _val_channel0; + __m256 _v4_channel7 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 7); + } + if (v2_cond) + { + _v2_channel0 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 7); + } + if (v3_cond) + { + _v3_channel0 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = _mm256_broadcast_ss(data_im_ptr + v3_pos 
* elempack + 7); + } + if (v4_cond) + { + _v4_channel0 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 7); + } + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val_channel0 = _mm256_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm256_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm256_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm256_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm256_comp_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm256_comp_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm256_comp_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm256_comp_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v4_channel7, _w4, _val_channel7); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val_channel0 = _mm256_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm256_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm256_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm256_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm256_mul_ps(_val_channel4, _mask); + _val_channel5 = 
_mm256_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm256_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm256_mul_ps(_val_channel7, _mask); + } + __m256 _conv_w0 = _mm256_load_ps(kptr); + __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m256 _conv_w2 = _mm256_load_ps(kptr + 16); // 2 * out_elempack + __m256 _conv_w3 = _mm256_load_ps(kptr + 24); // 3 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m256 _conv_w4 = _mm256_load_ps(kptr + 32); // 4 * out_elempack + __m256 _conv_w5 = _mm256_load_ps(kptr + 40); // 5 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m256 _conv_w6 = _mm256_load_ps(kptr + 48); // 6 * out_elempack + __m256 _conv_w7 = _mm256_load_ps(kptr + 56); // 7 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel7, _conv_w7, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx(_sum, activation_type, activation_params); + _mm256_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack8to1.h b/src/layer/x86/deformableconv2d_pack8to1.h new file mode 100644 index 00000000000..17a01c7b3bf --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack8to1.h @@ -0,0 +1,257 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack8to1_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 8; + const int out_elempack = 1; + const int wstep = out_elempack * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + float _sum = 0.f; + if (bias_data_ptr) + _sum = *(bias_data_ptr + oc); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + float _val_channel0 = 0.f; + float _val_channel1 = _val_channel0; + float _val_channel2 = _val_channel0; + float _val_channel3 = _val_channel0; + float _val_channel4 = _val_channel0; + float _val_channel5 = _val_channel0; + float _val_channel6 = _val_channel0; + float _val_channel7 = _val_channel0; + if (cond) + { + float _v1_channel0 = _val_channel0; + float _v1_channel1 = _val_channel0; + float _v1_channel2 = _val_channel0; + float _v1_channel3 = _val_channel0; + float _v1_channel4 = _val_channel0; + float _v1_channel5 = _val_channel0; + float 
_v1_channel6 = _val_channel0; + float _v1_channel7 = _val_channel0; + float _v2_channel0 = _val_channel0; + float _v2_channel1 = _val_channel0; + float _v2_channel2 = _val_channel0; + float _v2_channel3 = _val_channel0; + float _v2_channel4 = _val_channel0; + float _v2_channel5 = _val_channel0; + float _v2_channel6 = _val_channel0; + float _v2_channel7 = _val_channel0; + float _v3_channel0 = _val_channel0; + float _v3_channel1 = _val_channel0; + float _v3_channel2 = _val_channel0; + float _v3_channel3 = _val_channel0; + float _v3_channel4 = _val_channel0; + float _v3_channel5 = _val_channel0; + float _v3_channel6 = _val_channel0; + float _v3_channel7 = _val_channel0; + float _v4_channel0 = _val_channel0; + float _v4_channel1 = _val_channel0; + float _v4_channel2 = _val_channel0; + float _v4_channel3 = _val_channel0; + float _v4_channel4 = _val_channel0; + float _v4_channel5 = _val_channel0; + float _v4_channel6 = _val_channel0; + float _v4_channel7 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = *(data_im_ptr + v1_pos * elempack); + _v1_channel1 = *(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = *(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = *(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = *(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = *(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = *(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = *(data_im_ptr + v1_pos * elempack + 7); + } + if (v2_cond) + { + _v2_channel0 = *(data_im_ptr + v2_pos * elempack); + _v2_channel1 = *(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = *(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = *(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = *(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = *(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = *(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = *(data_im_ptr + v2_pos * elempack + 7); + } + if (v3_cond) + { + _v3_channel0 = *(data_im_ptr + v3_pos * elempack); + _v3_channel1 = *(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = *(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = *(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = *(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = *(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = *(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = *(data_im_ptr + v3_pos * elempack + 7); + } + if (v4_cond) + { + _v4_channel0 = *(data_im_ptr + v4_pos * elempack); + _v4_channel1 = *(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = *(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = *(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = *(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = *(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = *(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = *(data_im_ptr + v4_pos * elempack + 7); + } + _val_channel0 = w1 * _v1_channel0 + w2 * _v2_channel0 + w3 * _v3_channel0 + w4 * _v4_channel0; + _val_channel1 = w1 * _v1_channel1 + w2 * _v2_channel1 + w3 * _v3_channel1 + w4 * _v4_channel1; + _val_channel2 = w1 * _v1_channel2 + w2 * _v2_channel2 + w3 * _v3_channel2 + w4 * _v4_channel2; + _val_channel3 = w1 * _v1_channel3 + w2 * _v2_channel3 + w3 * _v3_channel3 + w4 * _v4_channel3; + _val_channel4 = w1 * _v1_channel4 + w2 * _v2_channel4 + w3 * _v3_channel4 + w4 * _v4_channel4; + _val_channel5 = w1 * _v1_channel5 + w2 * _v2_channel5 + w3 * _v3_channel5 + w4 * _v4_channel5; + _val_channel6 = w1 * _v1_channel6 + w2 * _v2_channel6 + w3 * _v3_channel6 + w4 * 
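+ // Each _val_channelN is the bilinear blend w1*v1 + w2*v2 + w3*v3 + w4*v4
+ // of input lane N; values stay scalar since the output elempack is 1.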
_v4_channel6; + _val_channel7 = w1 * _v1_channel7 + w2 * _v2_channel7 + w3 * _v3_channel7 + w4 * _v4_channel7; + } + if (has_mask) + { + _val_channel0 *= mask_; + _val_channel1 *= mask_; + _val_channel2 *= mask_; + _val_channel3 *= mask_; + _val_channel4 *= mask_; + _val_channel5 *= mask_; + _val_channel6 *= mask_; + _val_channel7 *= mask_; + } + float _conv_w0 = *(kptr); + float _conv_w1 = *(kptr + out_elempack); // 1 * out_elempack + float _conv_w2 = *(kptr + 2); // 2 * out_elempack + float _conv_w3 = *(kptr + 3); // 3 * out_elempack + float _conv_w4 = *(kptr + 4); // 4 * out_elempack + float _conv_w5 = *(kptr + 5); // 5 * out_elempack + float _conv_w6 = *(kptr + 6); // 6 * out_elempack + float _conv_w7 = *(kptr + 7); // 7 * out_elempack + _sum += (_val_channel0 * _conv_w0 + _val_channel1 * _conv_w1 + _val_channel2 * _conv_w2 + _val_channel3 * _conv_w3 + _val_channel4 * _conv_w4 + _val_channel5 * _conv_w5 + _val_channel6 * _conv_w6 + _val_channel7 * _conv_w7); + kptr += wstep; + } + } + } + _sum = activation_ss(_sum, activation_type, activation_params); + *(outptr + h_col * outw + w_col) = _sum; + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack8to16.h b/src/layer/x86/deformableconv2d_pack8to16.h new file mode 100644 index 00000000000..1c77ac8f6a9 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack8to16.h @@ -0,0 +1,305 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack8to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 8; + const int out_elempack = 16; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m512 _sum = _mm512_setzero_ps(); + if (bias_data_ptr) + _sum = _mm512_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, 
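+ // w1s..w4s (and masks below) are stack broadcasts of the scalar bilinear
+ // weights and the modulation mask, built once per tap so the channel loop
+ // can run on full-width vectors.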
w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val_channel0 = _mm512_loadu_ps(zeros_ptr); + __m512 _val_channel1 = _val_channel0; + __m512 _val_channel2 = _val_channel0; + __m512 _val_channel3 = _val_channel0; + __m512 _val_channel4 = _val_channel0; + __m512 _val_channel5 = _val_channel0; + __m512 _val_channel6 = _val_channel0; + __m512 _val_channel7 = _val_channel0; + if (cond) + { + __m512 _v1_channel0 = _val_channel0; + __m512 _v1_channel1 = _val_channel0; + __m512 _v1_channel2 = _val_channel0; + __m512 _v1_channel3 = _val_channel0; + __m512 _v1_channel4 = _val_channel0; + __m512 _v1_channel5 = _val_channel0; + __m512 _v1_channel6 = _val_channel0; + __m512 _v1_channel7 = _val_channel0; + __m512 _v2_channel0 = _val_channel0; + __m512 _v2_channel1 = _val_channel0; + __m512 _v2_channel2 = _val_channel0; + __m512 _v2_channel3 = _val_channel0; + __m512 _v2_channel4 = _val_channel0; + __m512 _v2_channel5 = _val_channel0; + __m512 _v2_channel6 = _val_channel0; + __m512 _v2_channel7 = _val_channel0; + __m512 _v3_channel0 = _val_channel0; + __m512 _v3_channel1 = _val_channel0; + __m512 _v3_channel2 = _val_channel0; + __m512 _v3_channel3 = _val_channel0; + __m512 _v3_channel4 = _val_channel0; + __m512 _v3_channel5 = _val_channel0; + __m512 _v3_channel6 = _val_channel0; + __m512 _v3_channel7 = _val_channel0; + __m512 _v4_channel0 = _val_channel0; + __m512 _v4_channel1 = _val_channel0; + __m512 _v4_channel2 = _val_channel0; + __m512 _v4_channel3 = _val_channel0; + __m512 _v4_channel4 = _val_channel0; + __m512 _v4_channel5 = _val_channel0; + __m512 _v4_channel6 = _val_channel0; + __m512 _v4_channel7 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack]); + _v1_channel1 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 1]); + _v1_channel2 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 2]); + _v1_channel3 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 3]); + _v1_channel4 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 4]); + _v1_channel5 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 5]); + _v1_channel6 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 6]); + _v1_channel7 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 7]); + } + if (v2_cond) + { + _v2_channel0 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack]); + _v2_channel1 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 1]); + _v2_channel2 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 2]); + _v2_channel3 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 3]); + _v2_channel4 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 4]); + _v2_channel5 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 5]); + _v2_channel6 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 6]); + _v2_channel7 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 7]); + } + if (v3_cond) + { + _v3_channel0 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack]); + _v3_channel1 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 1]); + _v3_channel2 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 2]); + _v3_channel3 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 3]); + _v3_channel4 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 4]); + _v3_channel5 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 5]); + _v3_channel6 = 
_mm512_set1_ps(data_im_ptr[v3_pos * elempack + 6]); + _v3_channel7 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 7]); + } + if (v4_cond) + { + _v4_channel0 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack]); + _v4_channel1 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 1]); + _v4_channel2 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 2]); + _v4_channel3 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 3]); + _v4_channel4 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 4]); + _v4_channel5 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 5]); + _v4_channel6 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 6]); + _v4_channel7 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 7]); + } + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val_channel0 = _mm512_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm512_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm512_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm512_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm512_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm512_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm512_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm512_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v4_channel7, _w4, _val_channel7); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val_channel0 = _mm512_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm512_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm512_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm512_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm512_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm512_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm512_mul_ps(_val_channel6, _mask); + _val_channel7 = 
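+ // Each of the 8 input lanes is splatted across the 16 output lanes with
+ // _mm512_set1_ps before being blended with the four corner weights.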
_mm512_mul_ps(_val_channel7, _mask); + } + __m512 _conv_w0 = _mm512_load_ps(kptr); + __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm512_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m512 _conv_w2 = _mm512_load_ps(kptr + 32); // 2 * out_elempack + __m512 _conv_w3 = _mm512_load_ps(kptr + 48); // 3 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm512_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m512 _conv_w4 = _mm512_load_ps(kptr + 64); // 4 * out_elempack + __m512 _conv_w5 = _mm512_load_ps(kptr + 80); // 5 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm512_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m512 _conv_w6 = _mm512_load_ps(kptr + 96); // 6 * out_elempack + __m512 _conv_w7 = _mm512_load_ps(kptr + 112); // 7 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm512_fmadd_ps(_val_channel7, _conv_w7, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx512(_sum, activation_type, activation_params); + _mm512_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack8to4.h b/src/layer/x86/deformableconv2d_pack8to4.h new file mode 100644 index 00000000000..0c905d8ff62 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack8to4.h @@ -0,0 +1,305 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack8to4_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
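+ // Two inputs (data + offset) means DCNv1; a third blob supplies the DCNv2
+ // modulation mask, otherwise mask_ stays 1.f throughout.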
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 8; + const int out_elempack = 4; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m128 _sum = _mm_setzero_ps(); + if (bias_data_ptr) + _sum = _mm_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + 
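+ // Each input channel carries 8 packed lanes; every lane is sampled at the
+ // four bilinear corners and accumulated into the 4-wide output register.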
__m128 _val_channel0 = _mm_loadu_ps(zeros_ptr); + __m128 _val_channel1 = _val_channel0; + __m128 _val_channel2 = _val_channel0; + __m128 _val_channel3 = _val_channel0; + __m128 _val_channel4 = _val_channel0; + __m128 _val_channel5 = _val_channel0; + __m128 _val_channel6 = _val_channel0; + __m128 _val_channel7 = _val_channel0; + if (cond) + { + __m128 _v1_channel0 = _val_channel0; + __m128 _v1_channel1 = _val_channel0; + __m128 _v1_channel2 = _val_channel0; + __m128 _v1_channel3 = _val_channel0; + __m128 _v1_channel4 = _val_channel0; + __m128 _v1_channel5 = _val_channel0; + __m128 _v1_channel6 = _val_channel0; + __m128 _v1_channel7 = _val_channel0; + __m128 _v2_channel0 = _val_channel0; + __m128 _v2_channel1 = _val_channel0; + __m128 _v2_channel2 = _val_channel0; + __m128 _v2_channel3 = _val_channel0; + __m128 _v2_channel4 = _val_channel0; + __m128 _v2_channel5 = _val_channel0; + __m128 _v2_channel6 = _val_channel0; + __m128 _v2_channel7 = _val_channel0; + __m128 _v3_channel0 = _val_channel0; + __m128 _v3_channel1 = _val_channel0; + __m128 _v3_channel2 = _val_channel0; + __m128 _v3_channel3 = _val_channel0; + __m128 _v3_channel4 = _val_channel0; + __m128 _v3_channel5 = _val_channel0; + __m128 _v3_channel6 = _val_channel0; + __m128 _v3_channel7 = _val_channel0; + __m128 _v4_channel0 = _val_channel0; + __m128 _v4_channel1 = _val_channel0; + __m128 _v4_channel2 = _val_channel0; + __m128 _v4_channel3 = _val_channel0; + __m128 _v4_channel4 = _val_channel0; + __m128 _v4_channel5 = _val_channel0; + __m128 _v4_channel6 = _val_channel0; + __m128 _v4_channel7 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm_load1_ps(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 7); + } + if (v2_cond) + { + _v2_channel0 = _mm_load1_ps(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 7); + } + if (v3_cond) + { + _v3_channel0 = _mm_load1_ps(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 7); + } + if (v4_cond) + { + _v4_channel0 = _mm_load1_ps(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm_load1_ps(data_im_ptr + v4_pos * 
elempack + 3); + _v4_channel4 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 7); + } + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val_channel0 = _mm_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm_comp_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm_comp_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm_comp_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm_comp_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v4_channel7, _w4, _val_channel7); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val_channel0 = _mm_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm_mul_ps(_val_channel7, _mask); + } + __m128 _conv_w0 = _mm_load_ps(kptr); + __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m128 _conv_w2 = _mm_load_ps(kptr + 8); // 2 * out_elempack + __m128 _conv_w3 = _mm_load_ps(kptr + 12); // 3 * out_elempack 
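+ // Each group of out_elempack floats in the packed weights holds one input
+ // lane's kernel value; kptr advances by wstep = elempack * out_elempack
+ // per input channel.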
+ _sum = _mm_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m128 _conv_w4 = _mm_load_ps(kptr + 16); // 4 * out_elempack + __m128 _conv_w5 = _mm_load_ps(kptr + 20); // 5 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m128 _conv_w6 = _mm_load_ps(kptr + 24); // 6 * out_elempack + __m128 _conv_w7 = _mm_load_ps(kptr + 28); // 7 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel7, _conv_w7, _sum); + kptr += wstep; + } + } + } + _sum = activation_sse(_sum, activation_type, activation_params); + _mm_storeu_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_sgemm.h b/src/layer/x86/deformableconv2d_sgemm.h new file mode 100644 index 00000000000..7efaa50ec65 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm.h @@ -0,0 +1,134 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm.h" + +static void deformableconv2d_im2col_sgemm_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
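+ // This generic path materializes the deformable im2col matrix explicitly,
+ // then reuses the regular convolution's im2col_sgemm_sse for the matmul.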
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + for (int ic = 0; ic < inch; ic++) + { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + bottom_im2col.channel(ic).row(i * kernel_w + j)[h_col * outw + w_col] = val * mask_; + } + } + } + } + } + } + + im2col_sgemm_sse(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16.h b/src/layer/x86/deformableconv2d_sgemm_pack16.h new file mode 100644 index 00000000000..d633b1d2eb8 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack16.h @@ -0,0 +1,178 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack16.h" + +static void deformableconv2d_im2col_sgemm_pack16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 16; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w 
+ w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1 = _val; + __m512 _v2 = _val; + __m512 _v3 = _val; + __m512 _v4 = _val; + if (v1_cond) + _v1 = _mm512_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm512_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm512_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm512_load_ps(data_im_ptr + v4_pos * elempack); + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val = _mm512_fmadd_ps(_v1, _w1, _val); + _val = _mm512_fmadd_ps(_v2, _w2, _val); + _val = _mm512_fmadd_ps(_v3, _w3, _val); + _val = _mm512_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val = _mm512_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm512_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack16_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16to1.h b/src/layer/x86/deformableconv2d_sgemm_pack16to1.h new file mode 100644 index 00000000000..9b341a6c44c --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack16to1.h @@ -0,0 +1,178 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
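+ // pack16to1: the deformable im2col below keeps the input's elempack-16
+ // layout and hands the packed matrix to convolution's
+ // im2col_sgemm_pack16to1_avx512 for the 16-to-1 matmul.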
+ +#include "convolution_sgemm_pack16to1.h" + +static void deformableconv2d_im2col_sgemm_pack16to1_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 16; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, 
w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1 = _val; + __m512 _v2 = _val; + __m512 _v3 = _val; + __m512 _v4 = _val; + if (v1_cond) + _v1 = _mm512_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm512_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm512_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm512_load_ps(data_im_ptr + v4_pos * elempack); + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val = _mm512_fmadd_ps(_v1, _w1, _val); + _val = _mm512_fmadd_ps(_v2, _w2, _val); + _val = _mm512_fmadd_ps(_v3, _w3, _val); + _val = _mm512_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val = _mm512_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm512_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack16to1_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16to4.h b/src/layer/x86/deformableconv2d_sgemm_pack16to4.h new file mode 100644 index 00000000000..bd609161ca3 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack16to4.h @@ -0,0 +1,178 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack16to4.h" + +static void deformableconv2d_im2col_sgemm_pack16to4_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 16; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = 
bottom_blob.channel(ic); + __m512 _val = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1 = _val; + __m512 _v2 = _val; + __m512 _v3 = _val; + __m512 _v4 = _val; + if (v1_cond) + _v1 = _mm512_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm512_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm512_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm512_load_ps(data_im_ptr + v4_pos * elempack); + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val = _mm512_fmadd_ps(_v1, _w1, _val); + _val = _mm512_fmadd_ps(_v2, _w2, _val); + _val = _mm512_fmadd_ps(_v3, _w3, _val); + _val = _mm512_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val = _mm512_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm512_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack16to4_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16to8.h b/src/layer/x86/deformableconv2d_sgemm_pack16to8.h new file mode 100644 index 00000000000..e966a2860cf --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack16to8.h @@ -0,0 +1,178 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack16to8.h" + +static void deformableconv2d_im2col_sgemm_pack16to8_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
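+ // With elempack-16 input, each sampled corner yields one full zmm register
+ // per channel (_mm512_load_ps below), so no per-lane splatting is needed.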
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 16; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = 
bottom_blob.channel(ic); + __m512 _val = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1 = _val; + __m512 _v2 = _val; + __m512 _v3 = _val; + __m512 _v4 = _val; + if (v1_cond) + _v1 = _mm512_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm512_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm512_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm512_load_ps(data_im_ptr + v4_pos * elempack); + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val = _mm512_fmadd_ps(_v1, _w1, _val); + _val = _mm512_fmadd_ps(_v2, _w2, _val); + _val = _mm512_fmadd_ps(_v3, _w3, _val); + _val = _mm512_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val = _mm512_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm512_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack16to8_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack1to16.h b/src/layer/x86/deformableconv2d_sgemm_pack1to16.h new file mode 100644 index 00000000000..d3cfcf62c3d --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack1to16.h @@ -0,0 +1,134 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack1to16.h" + +static void deformableconv2d_im2col_sgemm_pack1to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
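+ // Input convention shared by all these im2col helpers: bottom_blobs[0] is the
+ // image, bottom_blobs[1] the offset blob with 2 * kernel_h * kernel_w channels
+ // ((y, x) pair per kernel tap), and the optional bottom_blobs[2] the mask blob
+ // with kernel_h * kernel_w channels. Without a mask, mask_ stays 1.f below,
+ // i.e. plain (unmodulated) deformable convolution.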
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + for (int ic = 0; ic < inch; ic++) + { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + bottom_im2col.channel(ic).row(i * kernel_w + j)[h_col * outw + w_col] = val * mask_; + } + } + } + } + } + } + + im2col_sgemm_pack1to16_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack1to4.h b/src/layer/x86/deformableconv2d_sgemm_pack1to4.h new file mode 100644 index 00000000000..b314a86f006 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack1to4.h @@ -0,0 +1,134 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack1to4.h" + +static void deformableconv2d_im2col_sgemm_pack1to4_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + for (int ic = 0; ic < inch; ic++) + { + 
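+ // Scalar reference path: with elempack == 1 each tap is read directly and the
+ // blend below is exactly what the packed variants replicate lane-wise. Border
+ // handling amounts to implicit zero padding; e.g. (hypothetical value)
+ // h_im = -0.5 passes cond (> -1) but gives h_low = -1, disabling the two top
+ // taps, so only the bottom row contributes, with weights lh * hw and lh * lw.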
float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + bottom_im2col.channel(ic).row(i * kernel_w + j)[h_col * outw + w_col] = val * mask_; + } + } + } + } + } + } + + im2col_sgemm_pack1to4_sse(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack1to8.h b/src/layer/x86/deformableconv2d_sgemm_pack1to8.h new file mode 100644 index 00000000000..ba5315f04aa --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack1to8.h @@ -0,0 +1,134 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack1to8.h" + +static void deformableconv2d_im2col_sgemm_pack1to8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
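+ // When the offset blob arrives packed (elempack > 1), logical channel y_c
+ // lives at packed channel y_c / elempack, lane y_c % elempack; the reads
+ // below unpack it as row(h_col)[w_col * elempack + y_c % elempack]. For
+ // example (values illustrative): elempack = 8 and y_c = 10 map to packed
+ // channel 1, lane 2.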
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + for (int ic = 0; ic < inch; ic++) + { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + bottom_im2col.channel(ic).row(i * kernel_w + j)[h_col * outw + w_col] = val * mask_; + } + } + } + } + } + } + + im2col_sgemm_pack1to8_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4.h b/src/layer/x86/deformableconv2d_sgemm_pack4.h new file mode 100644 index 00000000000..794baab3415 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack4.h @@ -0,0 +1,178 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack4.h" + +static void deformableconv2d_im2col_sgemm_pack4_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 4; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = 
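+ // The four conditions gate the top-left/top-right/bottom-left/bottom-right
+ // taps; a tap falling outside the image keeps its weight but samples an
+ // implicit zero. Note that cond itself accepts the open interval
+ // (-1, h) x (-1, w), so samples up to one pixel outside the border still
+ // blend with interior pixels.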
(h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1 = _val; + __m128 _v2 = _val; + __m128 _v3 = _val; + __m128 _v4 = _val; + if (v1_cond) + _v1 = _mm_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm_load_ps(data_im_ptr + v4_pos * elempack); + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val = _mm_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val = _mm_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack4_sse(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4to1.h b/src/layer/x86/deformableconv2d_sgemm_pack4to1.h new file mode 100644 index 00000000000..e30bb60c518 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack4to1.h @@ -0,0 +1,178 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack4to1.h" + +static void deformableconv2d_im2col_sgemm_pack4to1_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
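+ // Style note: the wNs/masks stack arrays used further down broadcast a scalar
+ // by storing it elempack times and reloading with _mm_loadu_ps; _mm_set1_ps
+ // would express the same broadcast directly and presumably compiles to
+ // similar code.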
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 4; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1 = _val; + __m128 _v2 = _val; + __m128 _v3 = _val; + __m128 _v4 = _val; + if (v1_cond) + _v1 = _mm_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = 
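+ // Load choice: _mm_load_ps (aligned) is used on the packed image channel,
+ // where offsets of v*_pos * elempack floats from the channel base should stay
+ // aligned given ncnn's aligned Mat allocation, while _mm_loadu_ps is used on
+ // the local stack arrays, whose alignment is not guaranteed.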
_mm_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm_load_ps(data_im_ptr + v4_pos * elempack); + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val = _mm_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val = _mm_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack4to1_sse(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4to16.h b/src/layer/x86/deformableconv2d_sgemm_pack4to16.h new file mode 100644 index 00000000000..ead34f1c6df --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack4to16.h @@ -0,0 +1,178 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack4to16.h" + +static void deformableconv2d_im2col_sgemm_pack4to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
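+ // The mask multiplies the interpolated sample, val = mask * bilinear(...),
+ // which is the modulation term of deformable convolution v2; there the mask
+ // is a learned per-tap scalar, typically sigmoid-activated into [0, 1].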
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 4; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1 = _val; + __m128 _v2 = _val; + __m128 _v3 = _val; + __m128 _v4 = _val; + if (v1_cond) + _v1 = _mm_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = 
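+ // Destination layout: bottom_im2col is (w = outh * outw, h = maxk, c = inch),
+ // so the store a few lines below writes tap (i * kernel_w + j) of output
+ // pixel (h_col, w_col) for this input channel; im2col_sgemm_* then multiplies
+ // this matrix by the reordered kernel to produce the output channels.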
_mm_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm_load_ps(data_im_ptr + v4_pos * elempack); + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val = _mm_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val = _mm_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack4to16_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4to8.h b/src/layer/x86/deformableconv2d_sgemm_pack4to8.h new file mode 100644 index 00000000000..0c5ef0952c9 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack4to8.h @@ -0,0 +1,178 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack4to8.h" + +static void deformableconv2d_im2col_sgemm_pack4to8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
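+ // Naming: packAtoB means the input blob is packed by A and the output blob by
+ // B; the im2col buffer keeps the input packing and the pack conversion
+ // happens inside the matching im2col_sgemm_packAtoB kernel.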
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 4; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1 = _val; + __m128 _v2 = _val; + __m128 _v3 = _val; + __m128 _v4 = _val; + if (v1_cond) + _v1 = _mm_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = 
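+ // _mm_comp_fmadd_ps is ncnn's compatibility fused multiply-add from
+ // x86_usability.h: it lowers to a true FMA instruction when FMA is available
+ // and to a separate multiply and add otherwise, which is why the SSE/AVX
+ // paths use it while the AVX-512 path calls _mm512_fmadd_ps directly.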
_mm_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm_load_ps(data_im_ptr + v4_pos * elempack); + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val = _mm_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val = _mm_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack4to8_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8.h b/src/layer/x86/deformableconv2d_sgemm_pack8.h new file mode 100644 index 00000000000..9494cece461 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack8.h @@ -0,0 +1,178 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack8.h" + +static void deformableconv2d_im2col_sgemm_pack8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
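+ // Threading: the OpenMP loop below parallelizes over output rows (h_col);
+ // every (h_col, w_col, i, j) tuple writes a distinct im2col slot, so worker
+ // threads never touch the same memory and no synchronization is needed.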
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 8; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1 = _val; + __m256 _v2 = _val; + __m256 _v3 = _val; + __m256 _v4 = _val; + if (v1_cond) + _v1 = _mm256_load_ps(data_im_ptr + 
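+ // v*_pos is a pixel index in the channel plane (row * w + col), scaled by
+ // elempack to a float offset. Illustrative numbers: w = 16, h_low = 2,
+ // w_low = 5 give v1_pos = 37, i.e. float offset 296 here with pack8.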
v1_pos * elempack); + if (v2_cond) + _v2 = _mm256_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm256_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm256_load_ps(data_im_ptr + v4_pos * elempack); + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val = _mm256_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm256_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm256_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm256_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val = _mm256_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm256_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack8_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8to1.h b/src/layer/x86/deformableconv2d_sgemm_pack8to1.h new file mode 100644 index 00000000000..0ce5558c328 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack8to1.h @@ -0,0 +1,178 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack8to1.h" + +static void deformableconv2d_im2col_sgemm_pack8to1_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 8; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1 = _val; + __m256 _v2 = _val; + __m256 _v3 = _val; + __m256 _v4 = _val; + if (v1_cond) + _v1 = _mm256_load_ps(data_im_ptr + 
v1_pos * elempack); + if (v2_cond) + _v2 = _mm256_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm256_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm256_load_ps(data_im_ptr + v4_pos * elempack); + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val = _mm256_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm256_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm256_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm256_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val = _mm256_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm256_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack8to1_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8to16.h b/src/layer/x86/deformableconv2d_sgemm_pack8to16.h new file mode 100644 index 00000000000..f32d0d5470f --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack8to16.h @@ -0,0 +1,178 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack8to16.h" + +static void deformableconv2d_im2col_sgemm_pack8to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 8; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1 = _val; + __m256 _v2 = _val; + __m256 _v3 = _val; + __m256 _v4 = _val; + if (v1_cond) + _v1 = _mm256_load_ps(data_im_ptr + 
v1_pos * elempack); + if (v2_cond) + _v2 = _mm256_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm256_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm256_load_ps(data_im_ptr + v4_pos * elempack); + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val = _mm256_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm256_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm256_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm256_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val = _mm256_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm256_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack8to16_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8to4.h b/src/layer/x86/deformableconv2d_sgemm_pack8to4.h new file mode 100644 index 00000000000..c94da45a4c7 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack8to4.h @@ -0,0 +1,178 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack8to4.h" + +static void deformableconv2d_im2col_sgemm_pack8to4_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 8; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1 = _val; + __m256 _v2 = _val; + __m256 _v3 = _val; + __m256 _v4 = _val; + if (v1_cond) + _v1 = _mm256_load_ps(data_im_ptr + 
v1_pos * elempack); + if (v2_cond) + _v2 = _mm256_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm256_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm256_load_ps(data_im_ptr + v4_pos * elempack); + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val = _mm256_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm256_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm256_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm256_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val = _mm256_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm256_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack8to4_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp index 869815283d9..722e8461820 100644 --- a/src/layer/x86/deformableconv2d_x86.cpp +++ b/src/layer/x86/deformableconv2d_x86.cpp @@ -14,63 +14,352 @@ #include "deformableconv2d_x86.h" +#if __SSE2__ +#include +#if __SSE4_1__ +#include +#if __AVX__ +#include +#endif +#endif // __SSE4_1__ +#endif // __SSE2__ +#include "x86_activation.h" +#include "x86_usability.h" + +#include "benchmark.h" +#include "cpu.h" #include "layer_type.h" namespace ncnn { +#include "deformableconv2d_sgemm.h" + +#if __SSE2__ +#include "deformableconv2d_pack4.h" +#include "deformableconv2d_pack1to4.h" +#include "deformableconv2d_pack4to1.h" + +#include "deformableconv2d_sgemm_pack4.h" +#include "deformableconv2d_sgemm_pack1to4.h" +#include "deformableconv2d_sgemm_pack4to1.h" + +#if __AVX__ +#include "deformableconv2d_pack8.h" +#include "deformableconv2d_pack4to8.h" +#include "deformableconv2d_pack1to8.h" +#include "deformableconv2d_pack8to4.h" +#include "deformableconv2d_pack8to1.h" + +#include "deformableconv2d_sgemm_pack8.h" +#include "deformableconv2d_sgemm_pack4to8.h" +#include "deformableconv2d_sgemm_pack1to8.h" +#include "deformableconv2d_sgemm_pack8to4.h" +#include "deformableconv2d_sgemm_pack8to1.h" + +#if __AVX512F__ +#include "deformableconv2d_pack16.h" +#include "deformableconv2d_pack8to16.h" +#include "deformableconv2d_pack4to16.h" +#include "deformableconv2d_pack1to16.h" +#include "deformableconv2d_pack16to8.h" +#include "deformableconv2d_pack16to4.h" +#include "deformableconv2d_pack16to1.h" + +#include "deformableconv2d_sgemm_pack16.h" +#include "deformableconv2d_sgemm_pack8to16.h" +#include "deformableconv2d_sgemm_pack4to16.h" +#include "deformableconv2d_sgemm_pack1to16.h" +#include "deformableconv2d_sgemm_pack16to8.h" +#include "deformableconv2d_sgemm_pack16to4.h" +#include "deformableconv2d_sgemm_pack16to1.h" +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ + DeformableConv2D_x86::DeformableConv2D_x86() { - one_blob_only = false; - support_inplace = false; +#if __SSE2__ + support_packing = true; +#endif // __SSE2__ - inner_product = 0; - permute = 0; + activation = 0; } -int DeformableConv2D_x86::create_pipeline(const Option& opt) +static int _4Dindex_to_1Dindex(int i0, int i1, int i2, int i3, int l1, int l2, int l3) +{ + return ((i0 * l1 + i1) * l2 + i2) * l3 + i3; +} + +static int _6Dindex_to_1Dindex(int i0, int i1, int i2, int i3, int i4, int i5, int l1, int l2, int l3, int l4, int l5) +{ + return ((((i0 * l1 + i1) * l2 + i2) * l3 + i3) * l4 + i4) * l5 + i5; +} + +static void 
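/* Editor's worked example, not part of the original patch: the helper
   defined below regroups the dense kw-kh-inch-outch weights so a single
   kernel tap can read elempack input lanes times out_elempack output
   lanes as one contiguous block. With elempack = out_elempack = 4, the
   scalar weight w[oc][ic][i][j] lands at flat offset
     ((((oc/4) * kernel_h + i) * kernel_w + j) * (num_input/4) + ic/4) * 16
         + (ic%4) * 4 + oc%4
   which is exactly what _6Dindex_to_1Dindex computes. */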
deformableconv2d_transform_kernel_packed_sse(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) { - const int in_c = weight_data_size / (num_output * kernel_h * kernel_w); + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pb-pa-inch/pa-kw-kh-outch/pb { - Mat weight_3d = weight_data.reshape(kernel_w * kernel_h, in_c, num_output); - weight_data_t.create(in_c, kernel_w * kernel_h, num_output); - if (weight_data_t.empty()) - return -100; - for (int q = 0; q < num_output; q++) - { - const Mat m = weight_3d.channel(q); - float* outptr = weight_data_t.channel(q); + const float* weight_ptr = weight_data; - for (int i = 0; i < kernel_w * kernel_h; i++) + weight_data_tm.create(num_input * maxk * num_output / (elempack * out_elempack), (size_t)4u * elempack * out_elempack, elempack * out_elempack); + float* ptr = weight_data_tm; + for (int oc = 0; oc < num_output; oc++) + { + for (int i = 0; i < kernel_h; i++) { - for (int j = 0; j < in_c; j++) + for (int j = 0; j < kernel_w; j++) { - *outptr++ = m.row(j)[i]; + for (int ic = 0; ic < num_input; ic++) + { + ptr[_6Dindex_to_1Dindex(oc / out_elempack, i, j, ic / elempack, ic % elempack, oc % out_elempack, kernel_h, kernel_w, num_input / elempack, elempack, out_elempack)] = weight_ptr[_4Dindex_to_1Dindex(oc, ic, i, j, num_input, kernel_h, kernel_w)]; + } } } } - weight_3d.release(); - weight_data_t = weight_data_t.reshape(in_c * kernel_w * kernel_h, num_output); + weight_data_tm = weight_data_tm.reshape(num_input / elempack, maxk, num_output / out_elempack); + } +} + +int DeformableConv2D_x86::create_pipeline(const Option& opt) +{ + activation = create_activation_layer(activation_type, activation_params, opt); + + int kernel_size = kernel_w * kernel_h; + int num_input = weight_data_size / kernel_size / num_output; + + int elempack = 1; + int out_elempack = 1; + +#if __SSE2__ + if (opt.use_packing_layout) + { +#if __AVX512F__ + elempack = num_input % 16 == 0 ? 16 : num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#elif __AVX__ + elempack = num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#else + elempack = num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 4 == 0 ? 
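/* Editor's example, not part of the original patch: packing is chosen
   per tensor from its channel count, largest pack first. On AVX2
   (__AVX__ without __AVX512F__), num_input = 24 and num_output = 6 give
   elempack = 8 (24 % 8 == 0) and out_elempack = 1 (6 % 8 != 0 and
   6 % 4 != 0), so the pack8to1 kernels are selected. */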
4 : 1; +#endif + } +#endif // __SSE2__ + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack16_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 8 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8to16_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 16 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack16to8_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 4 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4to16_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 16 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack16to4_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 1 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack1to16_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 16 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack16to1_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + +#endif // __AVX512F__ + + // pack8 + if (elempack == 8 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack4to8 + if (elempack == 4 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4to8_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, 
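/* Editor's note, not part of the original patch: each branch in this
   dispatch prepares exactly one weight layout. With
   opt.use_sgemm_convolution the weights go through the ordinary
   Convolution im2col transforms into weight_sgemm_data; otherwise they
   are repacked into weight_data_tm for the direct kernels. Only the
   layout that forward() will actually use is materialized. */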
num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack1to8 + if (elempack == 1 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack1to8_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack8to4 + if (elempack == 8 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack8to1 + if (elempack == 8 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif // __AVX__ + + // pack4 + if (elempack == 4 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack1to4 + if (elempack == 1 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack4to1 + if (elempack == 4 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4to1_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif // __SSE2__ - inner_product = ncnn::create_layer(ncnn::LayerType::InnerProduct); - ncnn::ParamDict pd; - pd.set(0, num_output); - pd.set(1, bias_term); - pd.set(2, weight_data_size); - pd.set(9, activation_type); - pd.set(10, activation_params); - inner_product->load_param(pd); - ncnn::Mat weights[2]; - weights[0] = weight_data_t; - if (bias_term) - weights[1] = bias_data; - inner_product->load_model(ncnn::ModelBinFromMatArray(weights)); - inner_product->create_pipeline(opt); + // pack1 + if (elempack == 1 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + weight_data_tm = weight_data; + } + } - permute = ncnn::create_layer(ncnn::LayerType::Permute); - ncnn::ParamDict permute_pd; - permute_pd.set(0, 1); - permute->load_param(permute_pd); - permute->create_pipeline(opt); + if (opt.lightmode) + { + weight_data.release(); } return 0; @@ -78,17 +367,11 @@ int 
DeformableConv2D_x86::create_pipeline(const Option& opt) int DeformableConv2D_x86::destroy_pipeline(const Option& opt) { - if (inner_product) + if (activation) { - inner_product->destroy_pipeline(opt); - delete inner_product; - inner_product = 0; - } - if (permute) - { - permute->destroy_pipeline(opt); - delete permute; - permute = 0; + activation->destroy_pipeline(opt); + delete activation; + activation = 0; } return 0; @@ -98,134 +381,428 @@ int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vec { const Mat& bottom_blob = bottom_blobs[0]; const Mat& offset = bottom_blobs[1]; - const bool has_mask = (bottom_blobs.size() == 3); + Mat& top_blob = top_blobs[0]; - const int w = bottom_blob.w; - const int h = bottom_blob.h; - const int in_c = bottom_blob.c; - const size_t elemsize = bottom_blob.elemsize; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + ppppp222(bottom_blob, "bottom_blob"); + ppppp222(offset, "offset"); + ppppp222(bottom_blobs[2], "mask"); const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int out_w = (w + pad_left + pad_right - kernel_extent_w) / stride_w + 1; const int out_h = (h + pad_top + pad_bottom - kernel_extent_h) / stride_h + 1; - // output = im2col matmul weight_t, im2col.shape is [out_h * out_w, kernel_h * kernel_w * in_c] (in python), - // weight_t.shape is [num_output, kernel_h * kernel_w * in_c] (in python), - // output.shape is [out_h * out_w, num_output] (in python). - Mat im2col; - im2col.create(kernel_h * kernel_w * in_c * out_h * out_w, elemsize, opt.blob_allocator); - if (im2col.empty()) - return -100; + int out_elempack = 1; +#if __SSE2__ + if (opt.use_packing_layout) + { +#if __AVX512F__ + out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#elif __AVX__ + out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#else + out_elempack = num_output % 4 == 0 ? 
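/* Editor's note, not part of the original patch: out_elempack must
   repeat the choice made in create_pipeline(), since the weights were
   repacked for it. The packed element size then scales as
     out_elemsize = elemsize / elempack * out_elempack
   e.g. fp32 pack8 input (elemsize 32, elempack 8) producing pack4
   output gives 32 / 8 * 4 = 16 bytes per element. */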
4 : 1; +#endif + } +#endif // __SSE2__ + size_t out_elemsize = elemsize / elempack * out_elempack; - Mat& output = top_blobs[0]; - output.create(num_output, out_h * out_w, elemsize, opt.blob_allocator); - if (output.empty()) + top_blob.create(out_w, out_h, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) return -100; - Mat bottom_blob_flatten = bottom_blob.reshape(w * h * in_c); - Mat offset_flatten = offset.reshape(offset.w * offset.h * offset.c); - const float* data_im_ptr = bottom_blob_flatten; - const float* data_offset_ptr = offset_flatten; - float* im2col_ptr = im2col; + const int num_input = channels * elempack; - // im2col - #pragma omp parallel for num_threads(opt.num_threads) - for (int h_col = 0; h_col < out_h; h_col++) +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16 && out_elempack == 16) { - for (int w_col = 0; w_col < out_w; w_col++) + if (opt.use_sgemm_convolution) { - int h_in = h_col * stride_h - pad_top; - int w_in = w_col * stride_w - pad_left; - float* data_col_ptr = im2col_ptr + (h_col * out_w + w_col) * kernel_h * kernel_w * in_c; - for (int i = 0; i < kernel_h; i++) + deformableconv2d_im2col_sgemm_pack16_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) { - for (int j = 0; j < kernel_w; j++) - { - const int data_offset_h_ptr = (((i * kernel_w + j) * 2) * out_h + h_col) * out_w + w_col; - const int data_offset_w_ptr = (((i * kernel_w + j) * 2 + 1) * out_h + h_col) * out_w + w_col; - - const float offset_h = data_offset_ptr[data_offset_h_ptr]; - const float offset_w = data_offset_ptr[data_offset_w_ptr]; - const float mask_ = has_mask ? bottom_blobs[2].channel(i * kernel_w + j).row(h_col)[w_col] : 1.f; - const float h_im = h_in + i * dilation_h + offset_h; - const float w_im = w_in + j * dilation_w + offset_w; - - // Bilinear - const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; - float w1 = 0.f; - float w2 = 0.f; - float w3 = 0.f; - float w4 = 0.f; - bool v1_cond = false; - bool v2_cond = false; - bool v3_cond = false; - bool v4_cond = false; - int v1_pos = 0; - int v2_pos = 0; - int v3_pos = 0; - int v4_pos = 0; - if (cond) - { - int h_low = floor(h_im); - int w_low = floor(w_im); - int h_high = h_low + 1; - int w_high = w_low + 1; - - float lh = h_im - h_low; - float lw = w_im - w_low; - float hh = 1 - lh; - float hw = 1 - lw; - - v1_cond = (h_low >= 0 && w_low >= 0); - v2_cond = (h_low >= 0 && w_high <= w - 1); - v3_cond = (h_high <= h - 1 && w_low >= 0); - v4_cond = (h_high <= h - 1 && w_high <= w - 1); - if (v1_cond) - v1_pos = h_low * w + w_low; - if (v2_cond) - v2_pos = h_low * w + w_high; - if (v3_cond) - v3_pos = h_high * w + w_low; - if (v4_cond) - v4_pos = h_high * w + w_high; - - w1 = hh * hw; - w2 = hh * lw; - w3 = lh * hw; - w4 = lh * lw; - } + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack16_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 8 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack8to16_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, 
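/* Editor's note, not part of the original patch: the sgemm kernels only
   do the matrix multiply, so the activation configured by
   activation_type is applied afterwards as a separate in-place pass.
   The direct deformableconv2d_pack* kernels instead take
   activation_type/activation_params and fuse the activation into their
   accumulation loop, which is why no forward_inplace follows them. */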
opt); + } + } + else + { + deformableconv2d_pack8to16_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 16 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack16to8_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack16to8_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 4 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack4to16_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack4to16_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 16 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack16to4_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack16to4_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack1to16_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack1to16_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 16 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack16to1_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack16to1_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + +#endif // __AVX512F__ + + if (elempack == 8 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack8_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + 
else + { + deformableconv2d_pack8_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack1to8_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack1to8_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 4 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack4to8_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack4to8_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 8 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack8to1_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack8to1_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 8 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack8to4_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack8to4_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } +#endif // __AVX__ + + if (elempack == 4 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack4_sse(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack4_sse(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack1to4_sse(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack1to4_sse(bottom_blobs, top_blob, 
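/* Editor's note, not part of the original patch: unlike the plain
   Convolution layer, the whole bottom_blobs vector is handed to each
   kernel, because the sampling grid depends on the per-pixel offset
   blob (bottom_blobs[1]) and the optional mask blob (bottom_blobs[2]);
   input must be re-gathered for every output location instead of
   sliding a fixed window. */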
weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 4 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack4to1_sse(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack4to1_sse(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } +#endif // __SSE2__ + + if (elempack == 1 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_sse(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + const float* weight_ptr = weight_data_tm; - const float* data_im_channel_ptr = data_im_ptr; - for (int c_im = 0; c_im < in_c; c_im++) + // naive deformable conv + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < out_h; h_col++) + { + for (int w_col = 0; w_col < out_w; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < num_output; oc++) { - float val = 0.f; - if (cond) + float sum = 0.f; + if (bias_term) + sum = bias_data[oc]; + for (int i = 0; i < kernel_h; i++) { - float v1 = v1_cond ? data_im_channel_ptr[v1_pos] : 0.f; - float v2 = v2_cond ? data_im_channel_ptr[v2_pos] : 0.f; - float v3 = v3_cond ? data_im_channel_ptr[v3_pos] : 0.f; - float v4 = v4_cond ? 
data_im_channel_ptr[v4_pos] : 0.f; - val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + }else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + }else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < channels; ic++) + { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? 
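/* Editor's worked example, not part of the original patch: for a sample
   at (h_im, w_im) = (2.25, 3.5) we get h_low = 2, w_low = 3, lh = 0.25,
   lw = 0.5, hence w1 = 0.375, w2 = 0.375, w3 = 0.125, w4 = 0.125, which
   always sum to 1. Corners outside the image (v*_cond false) contribute
   zero, and the interpolated value is finally scaled by mask_. */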
bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + sum += val * mask_ * weight_ptr[((oc * channels + ic) * kernel_h + i) * kernel_w + j]; + } + } } - *data_col_ptr = val * mask_; - data_col_ptr += 1; - data_im_channel_ptr += h * w; + top_blob.channel(oc).row(h_col)[w_col] = activation_ss(sum, activation_type, activation_params); } } } } } - im2col = im2col.reshape(kernel_h * kernel_w * in_c, out_h * out_w); - // call InnerProduct - inner_product->forward(im2col, output, opt); - ncnn::Mat output_t; - // call Permute - permute->forward(output, output_t, opt); - output_t = output_t.reshape(out_w, out_h, num_output); - top_blobs[0] = output_t; + return 0; } diff --git a/src/layer/x86/deformableconv2d_x86.h b/src/layer/x86/deformableconv2d_x86.h index 0e21c9392af..a4f02f8fccb 100644 --- a/src/layer/x86/deformableconv2d_x86.h +++ b/src/layer/x86/deformableconv2d_x86.h @@ -30,10 +30,10 @@ class DeformableConv2D_x86 : virtual public DeformableConv2D virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: - Mat weight_data_t; + Layer* activation; - Layer* inner_product; - Layer* permute; + Mat weight_data_tm; + Mat weight_sgemm_data; }; } // namespace ncnn diff --git a/tests/test_deformableconv2d.cpp b/tests/test_deformableconv2d.cpp index 01511e54496..b62557df98c 100644 --- a/tests/test_deformableconv2d.cpp +++ b/tests/test_deformableconv2d.cpp @@ -93,7 +93,23 @@ static int test_deformableconv2d_0() || test_deformableconv2d(9, 7, 8, 4, k, d, s, p, 1) || test_deformableconv2d(9, 7, 8, 13, k, d, s, p, 0) || test_deformableconv2d(9, 7, 13, 8, k, d, s, p, 1) - || test_deformableconv2d(9, 7, 16, 16, k, d, s, p, 0); + || test_deformableconv2d(9, 7, 16, 16, k, d, s, p, 0) + || test_deformableconv2d(16, 16, 1 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 16 * 3, k, d, s, p, 1); if (ret != 0) return -1; From c23da01ff4668fd343d43eb7013c14b5aa0c9bb4 Mon Sep 17 00:00:00 2001 From: miemie2013 <53960695+miemie2013@users.noreply.github.com> Date: Wed, 10 Aug 2022 17:10:15 +0800 Subject: [PATCH 17/18] delete debug code. 
--- src/layer/x86/deformableconv2d_x86.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp index 722e8461820..88e9829aa8f 100644 --- a/src/layer/x86/deformableconv2d_x86.cpp +++ b/src/layer/x86/deformableconv2d_x86.cpp @@ -389,9 +389,6 @@ int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vec int channels = bottom_blob.c; size_t elemsize = bottom_blob.elemsize; int elempack = bottom_blob.elempack; - ppppp222(bottom_blob, "bottom_blob"); - ppppp222(offset, "offset"); - ppppp222(bottom_blobs[2], "mask"); const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; From ed947d42cf8b00cdb3d9d753f0d8a4ae7ff40301 Mon Sep 17 00:00:00 2001 From: miemie2013 Date: Wed, 10 Aug 2022 10:09:03 +0000 Subject: [PATCH 18/18] apply code-format changes --- src/layer/x86/deformableconv2d_pack16.h | 34 +++++++++--------- src/layer/x86/deformableconv2d_pack16to1.h | 36 ++++++++++--------- src/layer/x86/deformableconv2d_pack16to4.h | 34 +++++++++--------- src/layer/x86/deformableconv2d_pack16to8.h | 34 +++++++++--------- src/layer/x86/deformableconv2d_pack1to16.h | 6 ++-- src/layer/x86/deformableconv2d_pack1to4.h | 6 ++-- src/layer/x86/deformableconv2d_pack1to8.h | 6 ++-- src/layer/x86/deformableconv2d_pack4.h | 10 +++--- src/layer/x86/deformableconv2d_pack4to1.h | 12 ++++--- src/layer/x86/deformableconv2d_pack4to16.h | 12 ++++--- src/layer/x86/deformableconv2d_pack4to8.h | 12 ++++--- src/layer/x86/deformableconv2d_pack8.h | 20 ++++++----- src/layer/x86/deformableconv2d_pack8to1.h | 20 ++++++----- src/layer/x86/deformableconv2d_pack8to16.h | 18 +++++----- src/layer/x86/deformableconv2d_pack8to4.h | 18 +++++----- src/layer/x86/deformableconv2d_sgemm.h | 6 ++-- src/layer/x86/deformableconv2d_sgemm_pack16.h | 6 ++-- .../x86/deformableconv2d_sgemm_pack16to1.h | 6 ++-- .../x86/deformableconv2d_sgemm_pack16to4.h | 6 ++-- .../x86/deformableconv2d_sgemm_pack16to8.h | 6 ++-- .../x86/deformableconv2d_sgemm_pack1to16.h | 6 ++-- .../x86/deformableconv2d_sgemm_pack1to4.h | 6 ++-- .../x86/deformableconv2d_sgemm_pack1to8.h | 6 ++-- src/layer/x86/deformableconv2d_sgemm_pack4.h | 6 ++-- .../x86/deformableconv2d_sgemm_pack4to1.h | 6 ++-- .../x86/deformableconv2d_sgemm_pack4to16.h | 6 ++-- .../x86/deformableconv2d_sgemm_pack4to8.h | 6 ++-- src/layer/x86/deformableconv2d_sgemm_pack8.h | 6 ++-- .../x86/deformableconv2d_sgemm_pack8to1.h | 6 ++-- .../x86/deformableconv2d_sgemm_pack8to16.h | 6 ++-- .../x86/deformableconv2d_sgemm_pack8to4.h | 6 ++-- src/layer/x86/deformableconv2d_x86.cpp | 6 ++-- 32 files changed, 222 insertions(+), 158 deletions(-) diff --git a/src/layer/x86/deformableconv2d_pack16.h b/src/layer/x86/deformableconv2d_pack16.h index 7de08bc986d..42f260f6e96 100644 --- a/src/layer/x86/deformableconv2d_pack16.h +++ b/src/layer/x86/deformableconv2d_pack16.h @@ -62,7 +62,8 @@ static void deformableconv2d_pack16_avx512(const std::vector& bottom_blobs, { offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; - }else + } + else { const int y_c = (i * kernel_w + j) * 2; const int x_c = (i * kernel_w + j) * 2 + 1; @@ -75,7 +76,8 @@ static void deformableconv2d_pack16_avx512(const std::vector& bottom_blobs, if (mask_not_pack) { mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; - }else + } + else { const int m_c = i * kernel_w + j; mask_ = mask.channel(m_c / 
mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; @@ -390,35 +392,35 @@ static void deformableconv2d_pack16_avx512(const std::vector& bottom_blobs, _val_channelf = _mm512_mul_ps(_val_channelf, _mask); } __m512 _conv_w0 = _mm512_load_ps(kptr); - __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack); // 1 * out_elempack + __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack); // 1 * out_elempack _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum); _sum = _mm512_fmadd_ps(_val_channel1, _conv_w1, _sum); - __m512 _conv_w2 = _mm512_load_ps(kptr + 32); // 2 * out_elempack - __m512 _conv_w3 = _mm512_load_ps(kptr + 48); // 3 * out_elempack + __m512 _conv_w2 = _mm512_load_ps(kptr + 32); // 2 * out_elempack + __m512 _conv_w3 = _mm512_load_ps(kptr + 48); // 3 * out_elempack _sum = _mm512_fmadd_ps(_val_channel2, _conv_w2, _sum); _sum = _mm512_fmadd_ps(_val_channel3, _conv_w3, _sum); - __m512 _conv_w4 = _mm512_load_ps(kptr + 64); // 4 * out_elempack - __m512 _conv_w5 = _mm512_load_ps(kptr + 80); // 5 * out_elempack + __m512 _conv_w4 = _mm512_load_ps(kptr + 64); // 4 * out_elempack + __m512 _conv_w5 = _mm512_load_ps(kptr + 80); // 5 * out_elempack _sum = _mm512_fmadd_ps(_val_channel4, _conv_w4, _sum); _sum = _mm512_fmadd_ps(_val_channel5, _conv_w5, _sum); __m512 _conv_w6 = _mm512_load_ps(kptr + 96); // 6 * out_elempack - __m512 _conv_w7 = _mm512_load_ps(kptr + 112); // 7 * out_elempack + __m512 _conv_w7 = _mm512_load_ps(kptr + 112); // 7 * out_elempack _sum = _mm512_fmadd_ps(_val_channel6, _conv_w6, _sum); _sum = _mm512_fmadd_ps(_val_channel7, _conv_w7, _sum); - __m512 _conv_w8 = _mm512_load_ps(kptr + 128); // 8 * out_elempack - __m512 _conv_w9 = _mm512_load_ps(kptr + 144); // 9 * out_elempack + __m512 _conv_w8 = _mm512_load_ps(kptr + 128); // 8 * out_elempack + __m512 _conv_w9 = _mm512_load_ps(kptr + 144); // 9 * out_elempack _sum = _mm512_fmadd_ps(_val_channel8, _conv_w8, _sum); _sum = _mm512_fmadd_ps(_val_channel9, _conv_w9, _sum); - __m512 _conv_wa = _mm512_load_ps(kptr + 160); // 10 * out_elempack - __m512 _conv_wb = _mm512_load_ps(kptr + 176); // 11 * out_elempack + __m512 _conv_wa = _mm512_load_ps(kptr + 160); // 10 * out_elempack + __m512 _conv_wb = _mm512_load_ps(kptr + 176); // 11 * out_elempack _sum = _mm512_fmadd_ps(_val_channela, _conv_wa, _sum); _sum = _mm512_fmadd_ps(_val_channelb, _conv_wb, _sum); - __m512 _conv_wc = _mm512_load_ps(kptr + 192); // 12 * out_elempack - __m512 _conv_wd = _mm512_load_ps(kptr + 208); // 13 * out_elempack + __m512 _conv_wc = _mm512_load_ps(kptr + 192); // 12 * out_elempack + __m512 _conv_wd = _mm512_load_ps(kptr + 208); // 13 * out_elempack _sum = _mm512_fmadd_ps(_val_channelc, _conv_wc, _sum); _sum = _mm512_fmadd_ps(_val_channeld, _conv_wd, _sum); - __m512 _conv_we = _mm512_load_ps(kptr + 224); // 14 * out_elempack - __m512 _conv_wf = _mm512_load_ps(kptr + 240); // 15 * out_elempack + __m512 _conv_we = _mm512_load_ps(kptr + 224); // 14 * out_elempack + __m512 _conv_wf = _mm512_load_ps(kptr + 240); // 15 * out_elempack _sum = _mm512_fmadd_ps(_val_channele, _conv_we, _sum); _sum = _mm512_fmadd_ps(_val_channelf, _conv_wf, _sum); kptr += wstep; diff --git a/src/layer/x86/deformableconv2d_pack16to1.h b/src/layer/x86/deformableconv2d_pack16to1.h index f900464ad4e..c721f5c5233 100644 --- a/src/layer/x86/deformableconv2d_pack16to1.h +++ b/src/layer/x86/deformableconv2d_pack16to1.h @@ -60,7 +60,8 @@ static void deformableconv2d_pack16to1_avx512(const std::vector& bottom_blo { offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; 
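/* Editor's note, not part of the original patch: the offset blob
   carries 2 * kernel_h * kernel_w channels, interleaved as (y, x) pairs
   per kernel tap: channel 2*(i*kernel_w + j) is the vertical offset and
   channel 2*(i*kernel_w + j) + 1 the horizontal one. When the blob
   arrives packed, channel c is addressed as
     channel(c / elempack).row(h_col)[w_col * elempack + c % elempack]. */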
offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; - }else + } + else { const int y_c = (i * kernel_w + j) * 2; const int x_c = (i * kernel_w + j) * 2 + 1; @@ -73,7 +74,8 @@ static void deformableconv2d_pack16to1_avx512(const std::vector& bottom_blo if (mask_not_pack) { mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; - }else + } + else { const int m_c = i * kernel_w + j; mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; @@ -325,35 +327,35 @@ static void deformableconv2d_pack16to1_avx512(const std::vector& bottom_blo _val_channelf *= mask_; } float _conv_w0 = *(kptr); - float _conv_w1 = *(kptr + out_elempack); // 1 * out_elempack + float _conv_w1 = *(kptr + out_elempack); // 1 * out_elempack _sum += (_val_channel0 * _conv_w0); _sum += (_val_channel1 * _conv_w1); - float _conv_w2 = *(kptr + 2); // 2 * out_elempack - float _conv_w3 = *(kptr + 3); // 3 * out_elempack + float _conv_w2 = *(kptr + 2); // 2 * out_elempack + float _conv_w3 = *(kptr + 3); // 3 * out_elempack _sum += (_val_channel2 * _conv_w2); _sum += (_val_channel3 * _conv_w3); - float _conv_w4 = *(kptr + 4); // 4 * out_elempack - float _conv_w5 = *(kptr + 5); // 5 * out_elempack + float _conv_w4 = *(kptr + 4); // 4 * out_elempack + float _conv_w5 = *(kptr + 5); // 5 * out_elempack _sum += (_val_channel4 * _conv_w4); _sum += (_val_channel5 * _conv_w5); - float _conv_w6 = *(kptr + 6); // 6 * out_elempack - float _conv_w7 = *(kptr + 7); // 7 * out_elempack + float _conv_w6 = *(kptr + 6); // 6 * out_elempack + float _conv_w7 = *(kptr + 7); // 7 * out_elempack _sum += (_val_channel6 * _conv_w6); _sum += (_val_channel7 * _conv_w7); - float _conv_w8 = *(kptr + 8); // 8 * out_elempack - float _conv_w9 = *(kptr + 9); // 9 * out_elempack + float _conv_w8 = *(kptr + 8); // 8 * out_elempack + float _conv_w9 = *(kptr + 9); // 9 * out_elempack _sum += (_val_channel8 * _conv_w8); _sum += (_val_channel9 * _conv_w9); - float _conv_wa = *(kptr + 10); // 10 * out_elempack - float _conv_wb = *(kptr + 11); // 11 * out_elempack + float _conv_wa = *(kptr + 10); // 10 * out_elempack + float _conv_wb = *(kptr + 11); // 11 * out_elempack _sum += (_val_channela * _conv_wa); _sum += (_val_channelb * _conv_wb); - float _conv_wc = *(kptr + 12); // 12 * out_elempack - float _conv_wd = *(kptr + 13); // 13 * out_elempack + float _conv_wc = *(kptr + 12); // 12 * out_elempack + float _conv_wd = *(kptr + 13); // 13 * out_elempack _sum += (_val_channelc * _conv_wc); _sum += (_val_channeld * _conv_wd); - float _conv_we = *(kptr + 14); // 14 * out_elempack - float _conv_wf = *(kptr + 15); // 15 * out_elempack + float _conv_we = *(kptr + 14); // 14 * out_elempack + float _conv_wf = *(kptr + 15); // 15 * out_elempack _sum += (_val_channele * _conv_we); _sum += (_val_channelf * _conv_wf); kptr += wstep; diff --git a/src/layer/x86/deformableconv2d_pack16to4.h b/src/layer/x86/deformableconv2d_pack16to4.h index 8307def9948..a75e26ec8cf 100644 --- a/src/layer/x86/deformableconv2d_pack16to4.h +++ b/src/layer/x86/deformableconv2d_pack16to4.h @@ -62,7 +62,8 @@ static void deformableconv2d_pack16to4_avx512(const std::vector& bottom_blo { offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; - }else + } + else { const int y_c = (i * kernel_w + j) * 2; const int x_c = (i * kernel_w + j) * 2 + 1; @@ -75,7 +76,8 @@ static void deformableconv2d_pack16to4_avx512(const std::vector& bottom_blo if (mask_not_pack) { mask_ = 
mask.channel(i * kernel_w + j).row(h_col)[w_col]; - }else + } + else { const int m_c = i * kernel_w + j; mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; @@ -390,35 +392,35 @@ static void deformableconv2d_pack16to4_avx512(const std::vector& bottom_blo _val_channelf = _mm_mul_ps(_val_channelf, _mask); } __m128 _conv_w0 = _mm_load_ps(kptr); - __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack); // 1 * out_elempack + __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack); // 1 * out_elempack _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); _sum = _mm_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); __m128 _conv_w2 = _mm_load_ps(kptr + 8); // 2 * out_elempack - __m128 _conv_w3 = _mm_load_ps(kptr + 12); // 3 * out_elempack + __m128 _conv_w3 = _mm_load_ps(kptr + 12); // 3 * out_elempack _sum = _mm_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); _sum = _mm_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); - __m128 _conv_w4 = _mm_load_ps(kptr + 16); // 4 * out_elempack - __m128 _conv_w5 = _mm_load_ps(kptr + 20); // 5 * out_elempack + __m128 _conv_w4 = _mm_load_ps(kptr + 16); // 4 * out_elempack + __m128 _conv_w5 = _mm_load_ps(kptr + 20); // 5 * out_elempack _sum = _mm_comp_fmadd_ps(_val_channel4, _conv_w4, _sum); _sum = _mm_comp_fmadd_ps(_val_channel5, _conv_w5, _sum); - __m128 _conv_w6 = _mm_load_ps(kptr + 24); // 6 * out_elempack - __m128 _conv_w7 = _mm_load_ps(kptr + 28); // 7 * out_elempack + __m128 _conv_w6 = _mm_load_ps(kptr + 24); // 6 * out_elempack + __m128 _conv_w7 = _mm_load_ps(kptr + 28); // 7 * out_elempack _sum = _mm_comp_fmadd_ps(_val_channel6, _conv_w6, _sum); _sum = _mm_comp_fmadd_ps(_val_channel7, _conv_w7, _sum); - __m128 _conv_w8 = _mm_load_ps(kptr + 32); // 8 * out_elempack - __m128 _conv_w9 = _mm_load_ps(kptr + 36); // 9 * out_elempack + __m128 _conv_w8 = _mm_load_ps(kptr + 32); // 8 * out_elempack + __m128 _conv_w9 = _mm_load_ps(kptr + 36); // 9 * out_elempack _sum = _mm_comp_fmadd_ps(_val_channel8, _conv_w8, _sum); _sum = _mm_comp_fmadd_ps(_val_channel9, _conv_w9, _sum); - __m128 _conv_wa = _mm_load_ps(kptr + 40); // 10 * out_elempack - __m128 _conv_wb = _mm_load_ps(kptr + 44); // 11 * out_elempack + __m128 _conv_wa = _mm_load_ps(kptr + 40); // 10 * out_elempack + __m128 _conv_wb = _mm_load_ps(kptr + 44); // 11 * out_elempack _sum = _mm_comp_fmadd_ps(_val_channela, _conv_wa, _sum); _sum = _mm_comp_fmadd_ps(_val_channelb, _conv_wb, _sum); - __m128 _conv_wc = _mm_load_ps(kptr + 48); // 12 * out_elempack - __m128 _conv_wd = _mm_load_ps(kptr + 52); // 13 * out_elempack + __m128 _conv_wc = _mm_load_ps(kptr + 48); // 12 * out_elempack + __m128 _conv_wd = _mm_load_ps(kptr + 52); // 13 * out_elempack _sum = _mm_comp_fmadd_ps(_val_channelc, _conv_wc, _sum); _sum = _mm_comp_fmadd_ps(_val_channeld, _conv_wd, _sum); - __m128 _conv_we = _mm_load_ps(kptr + 56); // 14 * out_elempack - __m128 _conv_wf = _mm_load_ps(kptr + 60); // 15 * out_elempack + __m128 _conv_we = _mm_load_ps(kptr + 56); // 14 * out_elempack + __m128 _conv_wf = _mm_load_ps(kptr + 60); // 15 * out_elempack _sum = _mm_comp_fmadd_ps(_val_channele, _conv_we, _sum); _sum = _mm_comp_fmadd_ps(_val_channelf, _conv_wf, _sum); kptr += wstep; diff --git a/src/layer/x86/deformableconv2d_pack16to8.h b/src/layer/x86/deformableconv2d_pack16to8.h index 36c7d222abe..f44fc9ad0c8 100644 --- a/src/layer/x86/deformableconv2d_pack16to8.h +++ b/src/layer/x86/deformableconv2d_pack16to8.h @@ -62,7 +62,8 @@ static void deformableconv2d_pack16to8_avx512(const std::vector& bottom_blo { offset_h = 
offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -75,7 +76,8 @@ static void deformableconv2d_pack16to8_avx512(const std::vector& bottom_blo
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
@@ -390,35 +392,35 @@ static void deformableconv2d_pack16to8_avx512(const std::vector& bottom_blo
                             _val_channelf = _mm256_mul_ps(_val_channelf, _mask);
                         }
                         __m256 _conv_w0 = _mm256_load_ps(kptr);
-                        __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack);    // 1 * out_elempack
+                        __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack); // 1 * out_elempack
                         _sum = _mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum);
                         _sum = _mm256_comp_fmadd_ps(_val_channel1, _conv_w1, _sum);
-                        __m256 _conv_w2 = _mm256_load_ps(kptr + 16);    // 2 * out_elempack
-                        __m256 _conv_w3 = _mm256_load_ps(kptr + 24);    // 3 * out_elempack
+                        __m256 _conv_w2 = _mm256_load_ps(kptr + 16); // 2 * out_elempack
+                        __m256 _conv_w3 = _mm256_load_ps(kptr + 24); // 3 * out_elempack
                         _sum = _mm256_comp_fmadd_ps(_val_channel2, _conv_w2, _sum);
                         _sum = _mm256_comp_fmadd_ps(_val_channel3, _conv_w3, _sum);
-                        __m256 _conv_w4 = _mm256_load_ps(kptr + 32);    // 4 * out_elempack
-                        __m256 _conv_w5 = _mm256_load_ps(kptr + 40);    // 5 * out_elempack
+                        __m256 _conv_w4 = _mm256_load_ps(kptr + 32); // 4 * out_elempack
+                        __m256 _conv_w5 = _mm256_load_ps(kptr + 40); // 5 * out_elempack
                         _sum = _mm256_comp_fmadd_ps(_val_channel4, _conv_w4, _sum);
                         _sum = _mm256_comp_fmadd_ps(_val_channel5, _conv_w5, _sum);
-                        __m256 _conv_w6 = _mm256_load_ps(kptr + 48);    // 6 * out_elempack
-                        __m256 _conv_w7 = _mm256_load_ps(kptr + 56);    // 7 * out_elempack
+                        __m256 _conv_w6 = _mm256_load_ps(kptr + 48); // 6 * out_elempack
+                        __m256 _conv_w7 = _mm256_load_ps(kptr + 56); // 7 * out_elempack
                         _sum = _mm256_comp_fmadd_ps(_val_channel6, _conv_w6, _sum);
                         _sum = _mm256_comp_fmadd_ps(_val_channel7, _conv_w7, _sum);
-                        __m256 _conv_w8 = _mm256_load_ps(kptr + 64);    // 8 * out_elempack
-                        __m256 _conv_w9 = _mm256_load_ps(kptr + 72);    // 9 * out_elempack
+                        __m256 _conv_w8 = _mm256_load_ps(kptr + 64); // 8 * out_elempack
+                        __m256 _conv_w9 = _mm256_load_ps(kptr + 72); // 9 * out_elempack
                         _sum = _mm256_comp_fmadd_ps(_val_channel8, _conv_w8, _sum);
                         _sum = _mm256_comp_fmadd_ps(_val_channel9, _conv_w9, _sum);
-                        __m256 _conv_wa = _mm256_load_ps(kptr + 80);    // 10 * out_elempack
-                        __m256 _conv_wb = _mm256_load_ps(kptr + 88);    // 11 * out_elempack
+                        __m256 _conv_wa = _mm256_load_ps(kptr + 80); // 10 * out_elempack
+                        __m256 _conv_wb = _mm256_load_ps(kptr + 88); // 11 * out_elempack
                         _sum = _mm256_comp_fmadd_ps(_val_channela, _conv_wa, _sum);
                         _sum = _mm256_comp_fmadd_ps(_val_channelb, _conv_wb, _sum);
                         __m256 _conv_wc = _mm256_load_ps(kptr + 96);  // 12 * out_elempack
-                        __m256 _conv_wd = _mm256_load_ps(kptr + 104);    // 13 * out_elempack
+                        __m256 _conv_wd = _mm256_load_ps(kptr + 104); // 13 * out_elempack
                         _sum = _mm256_comp_fmadd_ps(_val_channelc, _conv_wc, _sum);
                         _sum = _mm256_comp_fmadd_ps(_val_channeld, _conv_wd, _sum);
-                        __m256 _conv_we = _mm256_load_ps(kptr + 112);    // 14 * out_elempack
-                        __m256 _conv_wf = _mm256_load_ps(kptr + 120);    // 15 * out_elempack
+                        __m256 _conv_we = _mm256_load_ps(kptr + 112); // 14 * out_elempack
+                        __m256 _conv_wf = _mm256_load_ps(kptr + 120); // 15 * out_elempack
                         _sum = _mm256_comp_fmadd_ps(_val_channele, _conv_we, _sum);
                         _sum = _mm256_comp_fmadd_ps(_val_channelf, _conv_wf, _sum);
                         kptr += wstep;
diff --git a/src/layer/x86/deformableconv2d_pack1to16.h b/src/layer/x86/deformableconv2d_pack1to16.h
index e3f18e84a22..b50e787e9c8 100644
--- a/src/layer/x86/deformableconv2d_pack1to16.h
+++ b/src/layer/x86/deformableconv2d_pack1to16.h
@@ -62,7 +62,8 @@ static void deformableconv2d_pack1to16_avx512(const std::vector& bottom_blo
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -75,7 +76,8 @@ static void deformableconv2d_pack1to16_avx512(const std::vector& bottom_blo
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
diff --git a/src/layer/x86/deformableconv2d_pack1to4.h b/src/layer/x86/deformableconv2d_pack1to4.h
index a3fb9a19d4a..0388111306f 100644
--- a/src/layer/x86/deformableconv2d_pack1to4.h
+++ b/src/layer/x86/deformableconv2d_pack1to4.h
@@ -62,7 +62,8 @@ static void deformableconv2d_pack1to4_sse(const std::vector& bottom_blobs,
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -75,7 +76,8 @@ static void deformableconv2d_pack1to4_sse(const std::vector& bottom_blobs,
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
diff --git a/src/layer/x86/deformableconv2d_pack1to8.h b/src/layer/x86/deformableconv2d_pack1to8.h
index f607972fa21..fe1e0c8c0a6 100644
--- a/src/layer/x86/deformableconv2d_pack1to8.h
+++ b/src/layer/x86/deformableconv2d_pack1to8.h
@@ -62,7 +62,8 @@ static void deformableconv2d_pack1to8_avx(const std::vector& bottom_blobs,
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -75,7 +76,8 @@ static void deformableconv2d_pack1to8_avx(const std::vector& bottom_blobs,
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
diff --git a/src/layer/x86/deformableconv2d_pack4.h b/src/layer/x86/deformableconv2d_pack4.h
index d6710fdba0e..32b27963fb1 100644
--- a/src/layer/x86/deformableconv2d_pack4.h
+++ b/src/layer/x86/deformableconv2d_pack4.h
@@ -62,7 +62,8 @@ static void deformableconv2d_pack4_sse(const std::vector& bottom_blobs, Mat
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -75,7 +76,8 @@ static void deformableconv2d_pack4_sse(const std::vector& bottom_blobs, Mat
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
@@ -222,11 +224,11 @@ static void deformableconv2d_pack4_sse(const std::vector& bottom_blobs, Mat
                             _val_channel3 = _mm_mul_ps(_val_channel3, _mask);
                         }
                         __m128 _conv_w0 = _mm_load_ps(kptr);
-                        __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack);    // 1 * out_elempack
+                        __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack); // 1 * out_elempack
                         _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum);
                         _sum = _mm_comp_fmadd_ps(_val_channel1, _conv_w1, _sum);
                         __m128 _conv_w2 = _mm_load_ps(kptr + 8);  // 2 * out_elempack
-                        __m128 _conv_w3 = _mm_load_ps(kptr + 12);    // 3 * out_elempack
+                        __m128 _conv_w3 = _mm_load_ps(kptr + 12); // 3 * out_elempack
                         _sum = _mm_comp_fmadd_ps(_val_channel2, _conv_w2, _sum);
                         _sum = _mm_comp_fmadd_ps(_val_channel3, _conv_w3, _sum);
                         kptr += wstep;
diff --git a/src/layer/x86/deformableconv2d_pack4to1.h b/src/layer/x86/deformableconv2d_pack4to1.h
index 597fe5e17a7..7ee073a91cb 100644
--- a/src/layer/x86/deformableconv2d_pack4to1.h
+++ b/src/layer/x86/deformableconv2d_pack4to1.h
@@ -60,7 +60,8 @@ static void deformableconv2d_pack4to1_sse(const std::vector& bottom_blobs,
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -73,7 +74,8 @@ static void deformableconv2d_pack4to1_sse(const std::vector& bottom_blobs,
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
@@ -193,9 +195,9 @@ static void deformableconv2d_pack4to1_sse(const std::vector& bottom_blobs,
                             _val_channel3 *= mask_;
                         }
                         float _conv_w0 = *(kptr);
-                        float _conv_w1 = *(kptr + out_elempack);    // 1 * out_elempack
-                        float _conv_w2 = *(kptr + 2);    // 2 * out_elempack
-                        float _conv_w3 = *(kptr + 3);    // 3 * out_elempack
+                        float _conv_w1 = *(kptr + out_elempack); // 1 * out_elempack
+                        float _conv_w2 = *(kptr + 2);            // 2 * out_elempack
+                        float _conv_w3 = *(kptr + 3);            // 3 * out_elempack
                         _sum += (_val_channel0 * _conv_w0 + _val_channel1 * _conv_w1 + _val_channel2 * _conv_w2 + _val_channel3 * _conv_w3);
                         kptr += wstep;
                     }
diff --git a/src/layer/x86/deformableconv2d_pack4to16.h b/src/layer/x86/deformableconv2d_pack4to16.h
index 74dc663962b..809bb7cb2b5 100644
--- a/src/layer/x86/deformableconv2d_pack4to16.h
+++ b/src/layer/x86/deformableconv2d_pack4to16.h
@@ -62,7 +62,8 @@ static void deformableconv2d_pack4to16_avx512(const std::vector& bottom_blo
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -75,7 +76,8 @@ static void deformableconv2d_pack4to16_avx512(const std::vector& bottom_blo
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
@@ -222,11 +224,11 @@ static void deformableconv2d_pack4to16_avx512(const std::vector& bottom_blo
                             _val_channel3 = _mm512_mul_ps(_val_channel3, _mask);
                         }
                         __m512 _conv_w0 = _mm512_load_ps(kptr);
-                        __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack);    // 1 * out_elempack
+                        __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack); // 1 * out_elempack
                         _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum);
                         _sum = _mm512_fmadd_ps(_val_channel1, _conv_w1, _sum);
-                        __m512 _conv_w2 = _mm512_load_ps(kptr + 32);    // 2 * out_elempack
-                        __m512 _conv_w3 = _mm512_load_ps(kptr + 48);    // 3 * out_elempack
+                        __m512 _conv_w2 = _mm512_load_ps(kptr + 32); // 2 * out_elempack
+                        __m512 _conv_w3 = _mm512_load_ps(kptr + 48); // 3 * out_elempack
                         _sum = _mm512_fmadd_ps(_val_channel2, _conv_w2, _sum);
                         _sum = _mm512_fmadd_ps(_val_channel3, _conv_w3, _sum);
                         kptr += wstep;
diff --git a/src/layer/x86/deformableconv2d_pack4to8.h b/src/layer/x86/deformableconv2d_pack4to8.h
index fc5830dce57..84099691826 100644
--- a/src/layer/x86/deformableconv2d_pack4to8.h
+++ b/src/layer/x86/deformableconv2d_pack4to8.h
@@ -62,7 +62,8 @@ static void deformableconv2d_pack4to8_avx(const std::vector& bottom_blobs,
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -75,7 +76,8 @@ static void deformableconv2d_pack4to8_avx(const std::vector& bottom_blobs,
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
@@ -222,11 +224,11 @@ static void deformableconv2d_pack4to8_avx(const std::vector& bottom_blobs,
                             _val_channel3 = _mm256_mul_ps(_val_channel3, _mask);
                         }
                         __m256 _conv_w0 = _mm256_load_ps(kptr);
-                        __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack);    // 1 * out_elempack
+                        __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack); // 1 * out_elempack
                         _sum = _mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum);
                         _sum = _mm256_comp_fmadd_ps(_val_channel1, _conv_w1, _sum);
-                        __m256 _conv_w2 = _mm256_load_ps(kptr + 16);    // 2 * out_elempack
-                        __m256 _conv_w3 = _mm256_load_ps(kptr + 24);    // 3 * out_elempack
+                        __m256 _conv_w2 = _mm256_load_ps(kptr + 16); // 2 * out_elempack
+                        __m256 _conv_w3 = _mm256_load_ps(kptr + 24); // 3 * out_elempack
                         _sum = _mm256_comp_fmadd_ps(_val_channel2, _conv_w2, _sum);
                         _sum = _mm256_comp_fmadd_ps(_val_channel3, _conv_w3, _sum);
                         kptr += wstep;
diff --git a/src/layer/x86/deformableconv2d_pack8.h b/src/layer/x86/deformableconv2d_pack8.h
index 696e4550818..277817e3948 100644
--- a/src/layer/x86/deformableconv2d_pack8.h
+++ b/src/layer/x86/deformableconv2d_pack8.h
@@ -62,7 +62,8 @@ static void deformableconv2d_pack8_avx(const std::vector& bottom_blobs, Mat
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -75,7 +76,8 @@ static void deformableconv2d_pack8_avx(const std::vector& bottom_blobs, Mat
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
@@ -278,19 +280,19 @@ static void deformableconv2d_pack8_avx(const std::vector& bottom_blobs, Mat
                             _val_channel7 = _mm256_mul_ps(_val_channel7, _mask);
                         }
                         __m256 _conv_w0 = _mm256_load_ps(kptr);
-                        __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack);    // 1 * out_elempack
+                        __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack); // 1 * out_elempack
                         _sum = _mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum);
                         _sum = _mm256_comp_fmadd_ps(_val_channel1, _conv_w1, _sum);
-                        __m256 _conv_w2 = _mm256_load_ps(kptr + 16);    // 2 * out_elempack
-                        __m256 _conv_w3 = _mm256_load_ps(kptr + 24);    // 3 * out_elempack
+                        __m256 _conv_w2 = _mm256_load_ps(kptr + 16); // 2 * out_elempack
+                        __m256 _conv_w3 = _mm256_load_ps(kptr + 24); // 3 * out_elempack
                         _sum = _mm256_comp_fmadd_ps(_val_channel2, _conv_w2, _sum);
                         _sum = _mm256_comp_fmadd_ps(_val_channel3, _conv_w3, _sum);
-                        __m256 _conv_w4 = _mm256_load_ps(kptr + 32);    // 4 * out_elempack
-                        __m256 _conv_w5 = _mm256_load_ps(kptr + 40);    // 5 * out_elempack
+                        __m256 _conv_w4 = _mm256_load_ps(kptr + 32); // 4 * out_elempack
+                        __m256 _conv_w5 = _mm256_load_ps(kptr + 40); // 5 * out_elempack
                         _sum = _mm256_comp_fmadd_ps(_val_channel4, _conv_w4, _sum);
                         _sum = _mm256_comp_fmadd_ps(_val_channel5, _conv_w5, _sum);
-                        __m256 _conv_w6 = _mm256_load_ps(kptr + 48);    // 6 * out_elempack
-                        __m256 _conv_w7 = _mm256_load_ps(kptr + 56);    // 7 * out_elempack
+                        __m256 _conv_w6 = _mm256_load_ps(kptr + 48); // 6 * out_elempack
+                        __m256 _conv_w7 = _mm256_load_ps(kptr + 56); // 7 * out_elempack
                         _sum = _mm256_comp_fmadd_ps(_val_channel6, _conv_w6, _sum);
                         _sum = _mm256_comp_fmadd_ps(_val_channel7, _conv_w7, _sum);
                         kptr += wstep;
diff --git a/src/layer/x86/deformableconv2d_pack8to1.h b/src/layer/x86/deformableconv2d_pack8to1.h
index 17a01c7b3bf..c4b97b40f06 100644
--- a/src/layer/x86/deformableconv2d_pack8to1.h
+++ b/src/layer/x86/deformableconv2d_pack8to1.h
@@ -60,7 +60,8 @@ static void deformableconv2d_pack8to1_avx(const std::vector& bottom_blobs,
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -73,7 +74,8 @@ static void deformableconv2d_pack8to1_avx(const std::vector& bottom_blobs,
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
@@ -237,13 +239,13 @@ static void deformableconv2d_pack8to1_avx(const std::vector& bottom_blobs,
                             _val_channel7 *= mask_;
                         }
                         float _conv_w0 = *(kptr);
-                        float _conv_w1 = *(kptr + out_elempack);    // 1 * out_elempack
-                        float _conv_w2 = *(kptr + 2);    // 2 * out_elempack
-                        float _conv_w3 = *(kptr + 3);    // 3 * out_elempack
-                        float _conv_w4 = *(kptr + 4);    // 4 * out_elempack
-                        float _conv_w5 = *(kptr + 5);    // 5 * out_elempack
-                        float _conv_w6 = *(kptr + 6);    // 6 * out_elempack
-                        float _conv_w7 = *(kptr + 7);    // 7 * out_elempack
+                        float _conv_w1 = *(kptr + out_elempack); // 1 * out_elempack
+                        float _conv_w2 = *(kptr + 2);            // 2 * out_elempack
+                        float _conv_w3 = *(kptr + 3);            // 3 * out_elempack
+                        float _conv_w4 = *(kptr + 4);            // 4 * out_elempack
+                        float _conv_w5 = *(kptr + 5);            // 5 * out_elempack
+                        float _conv_w6 = *(kptr + 6);            // 6 * out_elempack
+                        float _conv_w7 = *(kptr + 7);            // 7 * out_elempack
                         _sum += (_val_channel0 * _conv_w0 + _val_channel1 * _conv_w1 + _val_channel2 * _conv_w2 + _val_channel3 * _conv_w3 + _val_channel4 * _conv_w4 + _val_channel5 * _conv_w5 + _val_channel6 * _conv_w6 + _val_channel7 * _conv_w7);
                         kptr += wstep;
                     }
diff --git a/src/layer/x86/deformableconv2d_pack8to16.h b/src/layer/x86/deformableconv2d_pack8to16.h
index 1c77ac8f6a9..15e5ed076e6 100644
--- a/src/layer/x86/deformableconv2d_pack8to16.h
+++ b/src/layer/x86/deformableconv2d_pack8to16.h
@@ -62,7 +62,8 @@ static void deformableconv2d_pack8to16_avx512(const std::vector& bottom_blo
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -75,7 +76,8 @@ static void deformableconv2d_pack8to16_avx512(const std::vector& bottom_blo
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
@@ -278,19 +280,19 @@ static void deformableconv2d_pack8to16_avx512(const std::vector& bottom_blo
                             _val_channel7 = _mm512_mul_ps(_val_channel7, _mask);
                         }
                         __m512 _conv_w0 = _mm512_load_ps(kptr);
-                        __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack);    // 1 * out_elempack
+                        __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack); // 1 * out_elempack
                         _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum);
                         _sum = _mm512_fmadd_ps(_val_channel1, _conv_w1, _sum);
-                        __m512 _conv_w2 = _mm512_load_ps(kptr + 32);    // 2 * out_elempack
-                        __m512 _conv_w3 = _mm512_load_ps(kptr + 48);    // 3 * out_elempack
+                        __m512 _conv_w2 = _mm512_load_ps(kptr + 32); // 2 * out_elempack
+                        __m512 _conv_w3 = _mm512_load_ps(kptr + 48); // 3 * out_elempack
                         _sum = _mm512_fmadd_ps(_val_channel2, _conv_w2, _sum);
                         _sum = _mm512_fmadd_ps(_val_channel3, _conv_w3, _sum);
-                        __m512 _conv_w4 = _mm512_load_ps(kptr + 64);    // 4 * out_elempack
-                        __m512 _conv_w5 = _mm512_load_ps(kptr + 80);    // 5 * out_elempack
+                        __m512 _conv_w4 = _mm512_load_ps(kptr + 64); // 4 * out_elempack
+                        __m512 _conv_w5 = _mm512_load_ps(kptr + 80); // 5 * out_elempack
                         _sum = _mm512_fmadd_ps(_val_channel4, _conv_w4, _sum);
                         _sum = _mm512_fmadd_ps(_val_channel5, _conv_w5, _sum);
                         __m512 _conv_w6 = _mm512_load_ps(kptr + 96);  // 6 * out_elempack
-                        __m512 _conv_w7 = _mm512_load_ps(kptr + 112);    // 7 * out_elempack
+                        __m512 _conv_w7 = _mm512_load_ps(kptr + 112); // 7 * out_elempack
                         _sum = _mm512_fmadd_ps(_val_channel6, _conv_w6, _sum);
                         _sum = _mm512_fmadd_ps(_val_channel7, _conv_w7, _sum);
                         kptr += wstep;
diff --git a/src/layer/x86/deformableconv2d_pack8to4.h b/src/layer/x86/deformableconv2d_pack8to4.h
index 0c905d8ff62..85aa06aaa03 100644
--- a/src/layer/x86/deformableconv2d_pack8to4.h
+++ b/src/layer/x86/deformableconv2d_pack8to4.h
@@ -62,7 +62,8 @@ static void deformableconv2d_pack8to4_avx(const std::vector& bottom_blobs,
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -75,7 +76,8 @@ static void deformableconv2d_pack8to4_avx(const std::vector& bottom_blobs,
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
@@ -278,19 +280,19 @@ static void deformableconv2d_pack8to4_avx(const std::vector& bottom_blobs,
                             _val_channel7 = _mm_mul_ps(_val_channel7, _mask);
                         }
                         __m128 _conv_w0 = _mm_load_ps(kptr);
-                        __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack);    // 1 * out_elempack
+                        __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack); // 1 * out_elempack
                         _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum);
                         _sum = _mm_comp_fmadd_ps(_val_channel1, _conv_w1, _sum);
                         __m128 _conv_w2 = _mm_load_ps(kptr + 8);  // 2 * out_elempack
-                        __m128 _conv_w3 = _mm_load_ps(kptr + 12);    // 3 * out_elempack
+                        __m128 _conv_w3 = _mm_load_ps(kptr + 12); // 3 * out_elempack
                         _sum = _mm_comp_fmadd_ps(_val_channel2, _conv_w2, _sum);
                         _sum = _mm_comp_fmadd_ps(_val_channel3, _conv_w3, _sum);
-                        __m128 _conv_w4 = _mm_load_ps(kptr + 16);    // 4 * out_elempack
-                        __m128 _conv_w5 = _mm_load_ps(kptr + 20);    // 5 * out_elempack
+                        __m128 _conv_w4 = _mm_load_ps(kptr + 16); // 4 * out_elempack
+                        __m128 _conv_w5 = _mm_load_ps(kptr + 20); // 5 * out_elempack
                         _sum = _mm_comp_fmadd_ps(_val_channel4, _conv_w4, _sum);
                         _sum = _mm_comp_fmadd_ps(_val_channel5, _conv_w5, _sum);
-                        __m128 _conv_w6 = _mm_load_ps(kptr + 24);    // 6 * out_elempack
-                        __m128 _conv_w7 = _mm_load_ps(kptr + 28);    // 7 * out_elempack
+                        __m128 _conv_w6 = _mm_load_ps(kptr + 24); // 6 * out_elempack
+                        __m128 _conv_w7 = _mm_load_ps(kptr + 28); // 7 * out_elempack
                         _sum = _mm_comp_fmadd_ps(_val_channel6, _conv_w6, _sum);
                         _sum = _mm_comp_fmadd_ps(_val_channel7, _conv_w7, _sum);
                         kptr += wstep;
diff --git a/src/layer/x86/deformableconv2d_sgemm.h b/src/layer/x86/deformableconv2d_sgemm.h
index 7efaa50ec65..648af448b12 100644
--- a/src/layer/x86/deformableconv2d_sgemm.h
+++ b/src/layer/x86/deformableconv2d_sgemm.h
@@ -53,7 +53,8 @@ static void deformableconv2d_im2col_sgemm_sse(const std::vector& bottom_blo
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -66,7 +67,8 @@ static void deformableconv2d_im2col_sgemm_sse(const std::vector& bottom_blo
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16.h b/src/layer/x86/deformableconv2d_sgemm_pack16.h
index d633b1d2eb8..37aab40f1e4 100644
--- a/src/layer/x86/deformableconv2d_sgemm_pack16.h
+++ b/src/layer/x86/deformableconv2d_sgemm_pack16.h
@@ -56,7 +56,8 @@ static void deformableconv2d_im2col_sgemm_pack16_avx512(const std::vector&
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -69,7 +70,8 @@ static void deformableconv2d_im2col_sgemm_pack16_avx512(const std::vector&
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16to1.h b/src/layer/x86/deformableconv2d_sgemm_pack16to1.h
index 9b341a6c44c..686333e6ee4 100644
--- a/src/layer/x86/deformableconv2d_sgemm_pack16to1.h
+++ b/src/layer/x86/deformableconv2d_sgemm_pack16to1.h
@@ -56,7 +56,8 @@ static void deformableconv2d_im2col_sgemm_pack16to1_avx512(const std::vector& b
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -66,7 +67,8 @@ static void deformableconv2d_im2col_sgemm_pack1to4_sse(const std::vector& b
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
diff --git a/src/layer/x86/deformableconv2d_sgemm_pack1to8.h b/src/layer/x86/deformableconv2d_sgemm_pack1to8.h
index ba5315f04aa..d02c4245d7c 100644
--- a/src/layer/x86/deformableconv2d_sgemm_pack1to8.h
+++ b/src/layer/x86/deformableconv2d_sgemm_pack1to8.h
@@ -53,7 +53,8 @@ static void deformableconv2d_im2col_sgemm_pack1to8_avx(const std::vector& b
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -66,7 +67,8 @@ static void deformableconv2d_im2col_sgemm_pack1to8_avx(const std::vector& b
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4.h b/src/layer/x86/deformableconv2d_sgemm_pack4.h
index 794baab3415..140fa78e522 100644
--- a/src/layer/x86/deformableconv2d_sgemm_pack4.h
+++ b/src/layer/x86/deformableconv2d_sgemm_pack4.h
@@ -56,7 +56,8 @@ static void deformableconv2d_im2col_sgemm_pack4_sse(const std::vector& bott
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -69,7 +70,8 @@ static void deformableconv2d_im2col_sgemm_pack4_sse(const std::vector& bott
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4to1.h b/src/layer/x86/deformableconv2d_sgemm_pack4to1.h
index e30bb60c518..d5d7b57cab5 100644
--- a/src/layer/x86/deformableconv2d_sgemm_pack4to1.h
+++ b/src/layer/x86/deformableconv2d_sgemm_pack4to1.h
@@ -56,7 +56,8 @@ static void deformableconv2d_im2col_sgemm_pack4to1_sse(const std::vector& b
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -69,7 +70,8 @@ static void deformableconv2d_im2col_sgemm_pack4to1_sse(const std::vector& b
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4to16.h b/src/layer/x86/deformableconv2d_sgemm_pack4to16.h
index ead34f1c6df..7eef68bb01a 100644
--- a/src/layer/x86/deformableconv2d_sgemm_pack4to16.h
+++ b/src/layer/x86/deformableconv2d_sgemm_pack4to16.h
@@ -56,7 +56,8 @@ static void deformableconv2d_im2col_sgemm_pack4to16_avx512(const std::vector& b
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -69,7 +70,8 @@ static void deformableconv2d_im2col_sgemm_pack4to8_avx(const std::vector& b
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8.h b/src/layer/x86/deformableconv2d_sgemm_pack8.h
index 9494cece461..fce55606859 100644
--- a/src/layer/x86/deformableconv2d_sgemm_pack8.h
+++ b/src/layer/x86/deformableconv2d_sgemm_pack8.h
@@ -56,7 +56,8 @@ static void deformableconv2d_im2col_sgemm_pack8_avx(const std::vector& bott
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -69,7 +70,8 @@ static void deformableconv2d_im2col_sgemm_pack8_avx(const std::vector& bott
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8to1.h b/src/layer/x86/deformableconv2d_sgemm_pack8to1.h
index 0ce5558c328..635c08625ab 100644
--- a/src/layer/x86/deformableconv2d_sgemm_pack8to1.h
+++ b/src/layer/x86/deformableconv2d_sgemm_pack8to1.h
@@ -56,7 +56,8 @@ static void deformableconv2d_im2col_sgemm_pack8to1_avx(const std::vector& b
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -69,7 +70,8 @@ static void deformableconv2d_im2col_sgemm_pack8to1_avx(const std::vector& b
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8to16.h b/src/layer/x86/deformableconv2d_sgemm_pack8to16.h
index f32d0d5470f..161e983f1a0 100644
--- a/src/layer/x86/deformableconv2d_sgemm_pack8to16.h
+++ b/src/layer/x86/deformableconv2d_sgemm_pack8to16.h
@@ -56,7 +56,8 @@ static void deformableconv2d_im2col_sgemm_pack8to16_avx512(const std::vector& b
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -69,7 +70,8 @@ static void deformableconv2d_im2col_sgemm_pack8to4_avx(const std::vector& b
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp
index 88e9829aa8f..caff2e17d06 100644
--- a/src/layer/x86/deformableconv2d_x86.cpp
+++ b/src/layer/x86/deformableconv2d_x86.cpp
@@ -719,7 +719,8 @@ int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vec
                     {
                         offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                         offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int y_c = (i * kernel_w + j) * 2;
                         const int x_c = (i * kernel_w + j) * 2 + 1;
@@ -732,7 +733,8 @@ int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vec
                     if (mask_not_pack)
                     {
                         mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
-                    }else
+                    }
+                    else
                     {
                         const int m_c = i * kernel_w + j;
                         mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];