diff --git a/src/layer/x86/deformableconv2d_pack16.h b/src/layer/x86/deformableconv2d_pack16.h new file mode 100644 index 00000000000..42f260f6e96 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack16.h @@ -0,0 +1,435 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 16; + const int out_elempack = 16; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m512 _sum = _mm512_setzero_ps(); + if (bias_data_ptr) + _sum = _mm512_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in 
+ i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val_channel0 = _mm512_loadu_ps(zeros_ptr); + __m512 _val_channel1 = _val_channel0; + __m512 _val_channel2 = _val_channel0; + __m512 _val_channel3 = _val_channel0; + __m512 _val_channel4 = _val_channel0; + __m512 _val_channel5 = _val_channel0; + __m512 _val_channel6 = _val_channel0; + __m512 _val_channel7 = _val_channel0; + __m512 _val_channel8 = _val_channel0; + __m512 _val_channel9 = _val_channel0; + __m512 _val_channela = _val_channel0; + __m512 _val_channelb = _val_channel0; + __m512 _val_channelc = _val_channel0; + __m512 _val_channeld = _val_channel0; + __m512 _val_channele = _val_channel0; + __m512 _val_channelf = _val_channel0; + if (cond) + { + __m512 _v1_channel0 = _val_channel0; + __m512 _v1_channel1 = _val_channel0; + __m512 _v1_channel2 = _val_channel0; + __m512 _v1_channel3 = _val_channel0; + __m512 _v1_channel4 = _val_channel0; + __m512 _v1_channel5 = _val_channel0; + __m512 _v1_channel6 = _val_channel0; + __m512 _v1_channel7 = _val_channel0; + __m512 _v1_channel8 = _val_channel0; + __m512 _v1_channel9 = _val_channel0; + __m512 _v1_channela = _val_channel0; + __m512 _v1_channelb = _val_channel0; + __m512 _v1_channelc = _val_channel0; + __m512 _v1_channeld = _val_channel0; + __m512 _v1_channele = _val_channel0; + __m512 _v1_channelf = _val_channel0; + __m512 _v2_channel0 = _val_channel0; + __m512 _v2_channel1 = _val_channel0; + __m512 _v2_channel2 = _val_channel0; + __m512 _v2_channel3 = _val_channel0; + __m512 _v2_channel4 = _val_channel0; + __m512 _v2_channel5 = _val_channel0; + __m512 _v2_channel6 = _val_channel0; + __m512 _v2_channel7 = _val_channel0; + __m512 _v2_channel8 = _val_channel0; + __m512 _v2_channel9 = _val_channel0; + __m512 
_v2_channela = _val_channel0; + __m512 _v2_channelb = _val_channel0; + __m512 _v2_channelc = _val_channel0; + __m512 _v2_channeld = _val_channel0; + __m512 _v2_channele = _val_channel0; + __m512 _v2_channelf = _val_channel0; + __m512 _v3_channel0 = _val_channel0; + __m512 _v3_channel1 = _val_channel0; + __m512 _v3_channel2 = _val_channel0; + __m512 _v3_channel3 = _val_channel0; + __m512 _v3_channel4 = _val_channel0; + __m512 _v3_channel5 = _val_channel0; + __m512 _v3_channel6 = _val_channel0; + __m512 _v3_channel7 = _val_channel0; + __m512 _v3_channel8 = _val_channel0; + __m512 _v3_channel9 = _val_channel0; + __m512 _v3_channela = _val_channel0; + __m512 _v3_channelb = _val_channel0; + __m512 _v3_channelc = _val_channel0; + __m512 _v3_channeld = _val_channel0; + __m512 _v3_channele = _val_channel0; + __m512 _v3_channelf = _val_channel0; + __m512 _v4_channel0 = _val_channel0; + __m512 _v4_channel1 = _val_channel0; + __m512 _v4_channel2 = _val_channel0; + __m512 _v4_channel3 = _val_channel0; + __m512 _v4_channel4 = _val_channel0; + __m512 _v4_channel5 = _val_channel0; + __m512 _v4_channel6 = _val_channel0; + __m512 _v4_channel7 = _val_channel0; + __m512 _v4_channel8 = _val_channel0; + __m512 _v4_channel9 = _val_channel0; + __m512 _v4_channela = _val_channel0; + __m512 _v4_channelb = _val_channel0; + __m512 _v4_channelc = _val_channel0; + __m512 _v4_channeld = _val_channel0; + __m512 _v4_channele = _val_channel0; + __m512 _v4_channelf = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack]); + _v1_channel1 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 1]); + _v1_channel2 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 2]); + _v1_channel3 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 3]); + _v1_channel4 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 4]); + _v1_channel5 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 5]); + _v1_channel6 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 6]); + _v1_channel7 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 7]); + _v1_channel8 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 8]); + _v1_channel9 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 9]); + _v1_channela = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 10]); + _v1_channelb = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 11]); + _v1_channelc = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 12]); + _v1_channeld = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 13]); + _v1_channele = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 14]); + _v1_channelf = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 15]); + } + if (v2_cond) + { + _v2_channel0 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack]); + _v2_channel1 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 1]); + _v2_channel2 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 2]); + _v2_channel3 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 3]); + _v2_channel4 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 4]); + _v2_channel5 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 5]); + _v2_channel6 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 6]); + _v2_channel7 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 7]); + _v2_channel8 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 8]); + _v2_channel9 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 9]); + _v2_channela = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 10]); + _v2_channelb = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 11]); + _v2_channelc = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 12]); + _v2_channeld = 
_mm512_set1_ps(data_im_ptr[v2_pos * elempack + 13]); + _v2_channele = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 14]); + _v2_channelf = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 15]); + } + if (v3_cond) + { + _v3_channel0 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack]); + _v3_channel1 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 1]); + _v3_channel2 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 2]); + _v3_channel3 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 3]); + _v3_channel4 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 4]); + _v3_channel5 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 5]); + _v3_channel6 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 6]); + _v3_channel7 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 7]); + _v3_channel8 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 8]); + _v3_channel9 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 9]); + _v3_channela = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 10]); + _v3_channelb = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 11]); + _v3_channelc = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 12]); + _v3_channeld = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 13]); + _v3_channele = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 14]); + _v3_channelf = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 15]); + } + if (v4_cond) + { + _v4_channel0 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack]); + _v4_channel1 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 1]); + _v4_channel2 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 2]); + _v4_channel3 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 3]); + _v4_channel4 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 4]); + _v4_channel5 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 5]); + _v4_channel6 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 6]); + _v4_channel7 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 7]); + _v4_channel8 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 8]); + _v4_channel9 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 9]); + _v4_channela = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 10]); + _v4_channelb = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 11]); + _v4_channelc = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 12]); + _v4_channeld = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 13]); + _v4_channele = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 14]); + _v4_channelf = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 15]); + } + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val_channel0 = _mm512_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm512_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm512_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm512_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = 
_mm512_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm512_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm512_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm512_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm512_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v4_channel7, _w4, _val_channel7); + _val_channel8 = _mm512_fmadd_ps(_v1_channel8, _w1, _val_channel8); + _val_channel8 = _mm512_fmadd_ps(_v2_channel8, _w2, _val_channel8); + _val_channel8 = _mm512_fmadd_ps(_v3_channel8, _w3, _val_channel8); + _val_channel8 = _mm512_fmadd_ps(_v4_channel8, _w4, _val_channel8); + _val_channel9 = _mm512_fmadd_ps(_v1_channel9, _w1, _val_channel9); + _val_channel9 = _mm512_fmadd_ps(_v2_channel9, _w2, _val_channel9); + _val_channel9 = _mm512_fmadd_ps(_v3_channel9, _w3, _val_channel9); + _val_channel9 = _mm512_fmadd_ps(_v4_channel9, _w4, _val_channel9); + _val_channela = _mm512_fmadd_ps(_v1_channela, _w1, _val_channela); + _val_channela = _mm512_fmadd_ps(_v2_channela, _w2, _val_channela); + _val_channela = _mm512_fmadd_ps(_v3_channela, _w3, _val_channela); + _val_channela = _mm512_fmadd_ps(_v4_channela, _w4, _val_channela); + _val_channelb = _mm512_fmadd_ps(_v1_channelb, _w1, _val_channelb); + _val_channelb = _mm512_fmadd_ps(_v2_channelb, _w2, _val_channelb); + _val_channelb = _mm512_fmadd_ps(_v3_channelb, _w3, _val_channelb); + _val_channelb = _mm512_fmadd_ps(_v4_channelb, _w4, _val_channelb); + _val_channelc = _mm512_fmadd_ps(_v1_channelc, _w1, _val_channelc); + _val_channelc = _mm512_fmadd_ps(_v2_channelc, _w2, _val_channelc); + _val_channelc = _mm512_fmadd_ps(_v3_channelc, _w3, _val_channelc); + _val_channelc = _mm512_fmadd_ps(_v4_channelc, _w4, _val_channelc); + _val_channeld = _mm512_fmadd_ps(_v1_channeld, _w1, _val_channeld); + _val_channeld = _mm512_fmadd_ps(_v2_channeld, _w2, _val_channeld); + _val_channeld = _mm512_fmadd_ps(_v3_channeld, _w3, _val_channeld); + _val_channeld = _mm512_fmadd_ps(_v4_channeld, _w4, _val_channeld); + _val_channele = _mm512_fmadd_ps(_v1_channele, _w1, _val_channele); + _val_channele = _mm512_fmadd_ps(_v2_channele, _w2, _val_channele); + _val_channele = _mm512_fmadd_ps(_v3_channele, _w3, _val_channele); + _val_channele = _mm512_fmadd_ps(_v4_channele, _w4, _val_channele); + _val_channelf = _mm512_fmadd_ps(_v1_channelf, _w1, _val_channelf); + _val_channelf = _mm512_fmadd_ps(_v2_channelf, _w2, _val_channelf); + _val_channelf = _mm512_fmadd_ps(_v3_channelf, _w3, _val_channelf); + _val_channelf = _mm512_fmadd_ps(_v4_channelf, _w4, _val_channelf); + } + if (has_mask) + { + __m512 _mask = 
_mm512_loadu_ps(mask_ptr); + _val_channel0 = _mm512_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm512_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm512_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm512_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm512_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm512_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm512_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm512_mul_ps(_val_channel7, _mask); + _val_channel8 = _mm512_mul_ps(_val_channel8, _mask); + _val_channel9 = _mm512_mul_ps(_val_channel9, _mask); + _val_channela = _mm512_mul_ps(_val_channela, _mask); + _val_channelb = _mm512_mul_ps(_val_channelb, _mask); + _val_channelc = _mm512_mul_ps(_val_channelc, _mask); + _val_channeld = _mm512_mul_ps(_val_channeld, _mask); + _val_channele = _mm512_mul_ps(_val_channele, _mask); + _val_channelf = _mm512_mul_ps(_val_channelf, _mask); + } + __m512 _conv_w0 = _mm512_load_ps(kptr); + __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm512_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m512 _conv_w2 = _mm512_load_ps(kptr + 32); // 2 * out_elempack + __m512 _conv_w3 = _mm512_load_ps(kptr + 48); // 3 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm512_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m512 _conv_w4 = _mm512_load_ps(kptr + 64); // 4 * out_elempack + __m512 _conv_w5 = _mm512_load_ps(kptr + 80); // 5 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm512_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m512 _conv_w6 = _mm512_load_ps(kptr + 96); // 6 * out_elempack + __m512 _conv_w7 = _mm512_load_ps(kptr + 112); // 7 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm512_fmadd_ps(_val_channel7, _conv_w7, _sum); + __m512 _conv_w8 = _mm512_load_ps(kptr + 128); // 8 * out_elempack + __m512 _conv_w9 = _mm512_load_ps(kptr + 144); // 9 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel8, _conv_w8, _sum); + _sum = _mm512_fmadd_ps(_val_channel9, _conv_w9, _sum); + __m512 _conv_wa = _mm512_load_ps(kptr + 160); // 10 * out_elempack + __m512 _conv_wb = _mm512_load_ps(kptr + 176); // 11 * out_elempack + _sum = _mm512_fmadd_ps(_val_channela, _conv_wa, _sum); + _sum = _mm512_fmadd_ps(_val_channelb, _conv_wb, _sum); + __m512 _conv_wc = _mm512_load_ps(kptr + 192); // 12 * out_elempack + __m512 _conv_wd = _mm512_load_ps(kptr + 208); // 13 * out_elempack + _sum = _mm512_fmadd_ps(_val_channelc, _conv_wc, _sum); + _sum = _mm512_fmadd_ps(_val_channeld, _conv_wd, _sum); + __m512 _conv_we = _mm512_load_ps(kptr + 224); // 14 * out_elempack + __m512 _conv_wf = _mm512_load_ps(kptr + 240); // 15 * out_elempack + _sum = _mm512_fmadd_ps(_val_channele, _conv_we, _sum); + _sum = _mm512_fmadd_ps(_val_channelf, _conv_wf, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx512(_sum, activation_type, activation_params); + _mm512_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack16to1.h b/src/layer/x86/deformableconv2d_pack16to1.h new file mode 100644 index 00000000000..c721f5c5233 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack16to1.h @@ -0,0 +1,370 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack16to1_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 16; + const int out_elempack = 1; + const int wstep = out_elempack * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + float _sum = 0.f; + if (bias_data_ptr) + _sum = *(bias_data_ptr + oc); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + 
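// hh/hw below are the complements of lh/lw; their products w1..w4 are the standard bilinear corner weights +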
float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + float _val_channel0 = 0.f; + float _val_channel1 = _val_channel0; + float _val_channel2 = _val_channel0; + float _val_channel3 = _val_channel0; + float _val_channel4 = _val_channel0; + float _val_channel5 = _val_channel0; + float _val_channel6 = _val_channel0; + float _val_channel7 = _val_channel0; + float _val_channel8 = _val_channel0; + float _val_channel9 = _val_channel0; + float _val_channela = _val_channel0; + float _val_channelb = _val_channel0; + float _val_channelc = _val_channel0; + float _val_channeld = _val_channel0; + float _val_channele = _val_channel0; + float _val_channelf = _val_channel0; + if (cond) + { + float _v1_channel0 = _val_channel0; + float _v1_channel1 = _val_channel0; + float _v1_channel2 = _val_channel0; + float _v1_channel3 = _val_channel0; + float _v1_channel4 = _val_channel0; + float _v1_channel5 = _val_channel0; + float _v1_channel6 = _val_channel0; + float _v1_channel7 = _val_channel0; + float _v1_channel8 = _val_channel0; + float _v1_channel9 = _val_channel0; + float _v1_channela = _val_channel0; + float _v1_channelb = _val_channel0; + float _v1_channelc = _val_channel0; + float _v1_channeld = _val_channel0; + float _v1_channele = _val_channel0; + float _v1_channelf = _val_channel0; + float _v2_channel0 = _val_channel0; + float _v2_channel1 = _val_channel0; + float _v2_channel2 = _val_channel0; + float _v2_channel3 = _val_channel0; + float _v2_channel4 = _val_channel0; + float _v2_channel5 = _val_channel0; + float _v2_channel6 = _val_channel0; + float _v2_channel7 = _val_channel0; + float _v2_channel8 = _val_channel0; + float _v2_channel9 = _val_channel0; + float _v2_channela = _val_channel0; + float _v2_channelb = _val_channel0; + float _v2_channelc = _val_channel0; + float _v2_channeld = _val_channel0; + float _v2_channele = _val_channel0; + float _v2_channelf = _val_channel0; + float _v3_channel0 = _val_channel0; + float _v3_channel1 = _val_channel0; + float _v3_channel2 = _val_channel0; + float _v3_channel3 = _val_channel0; + float _v3_channel4 = _val_channel0; + float _v3_channel5 = _val_channel0; + float _v3_channel6 = _val_channel0; + float _v3_channel7 = _val_channel0; + float _v3_channel8 = _val_channel0; + float _v3_channel9 = _val_channel0; + float _v3_channela = _val_channel0; + float _v3_channelb = _val_channel0; + float _v3_channelc = _val_channel0; + float _v3_channeld = _val_channel0; + float _v3_channele = _val_channel0; + float _v3_channelf = _val_channel0; + float _v4_channel0 = _val_channel0; + float _v4_channel1 = _val_channel0; + float _v4_channel2 = _val_channel0; + float _v4_channel3 = _val_channel0; + float _v4_channel4 = _val_channel0; + float _v4_channel5 = _val_channel0; + float _v4_channel6 = _val_channel0; + float _v4_channel7 = _val_channel0; + float _v4_channel8 = _val_channel0; + float _v4_channel9 = _val_channel0; + float _v4_channela = _val_channel0; + float _v4_channelb = _val_channel0; + float _v4_channelc = _val_channel0; + float _v4_channeld = 
_val_channel0; + float _v4_channele = _val_channel0; + float _v4_channelf = _val_channel0; + if (v1_cond) + { + _v1_channel0 = *(data_im_ptr + v1_pos * elempack); + _v1_channel1 = *(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = *(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = *(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = *(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = *(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = *(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = *(data_im_ptr + v1_pos * elempack + 7); + _v1_channel8 = *(data_im_ptr + v1_pos * elempack + 8); + _v1_channel9 = *(data_im_ptr + v1_pos * elempack + 9); + _v1_channela = *(data_im_ptr + v1_pos * elempack + 10); + _v1_channelb = *(data_im_ptr + v1_pos * elempack + 11); + _v1_channelc = *(data_im_ptr + v1_pos * elempack + 12); + _v1_channeld = *(data_im_ptr + v1_pos * elempack + 13); + _v1_channele = *(data_im_ptr + v1_pos * elempack + 14); + _v1_channelf = *(data_im_ptr + v1_pos * elempack + 15); + } + if (v2_cond) + { + _v2_channel0 = *(data_im_ptr + v2_pos * elempack); + _v2_channel1 = *(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = *(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = *(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = *(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = *(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = *(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = *(data_im_ptr + v2_pos * elempack + 7); + _v2_channel8 = *(data_im_ptr + v2_pos * elempack + 8); + _v2_channel9 = *(data_im_ptr + v2_pos * elempack + 9); + _v2_channela = *(data_im_ptr + v2_pos * elempack + 10); + _v2_channelb = *(data_im_ptr + v2_pos * elempack + 11); + _v2_channelc = *(data_im_ptr + v2_pos * elempack + 12); + _v2_channeld = *(data_im_ptr + v2_pos * elempack + 13); + _v2_channele = *(data_im_ptr + v2_pos * elempack + 14); + _v2_channelf = *(data_im_ptr + v2_pos * elempack + 15); + } + if (v3_cond) + { + _v3_channel0 = *(data_im_ptr + v3_pos * elempack); + _v3_channel1 = *(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = *(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = *(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = *(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = *(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = *(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = *(data_im_ptr + v3_pos * elempack + 7); + _v3_channel8 = *(data_im_ptr + v3_pos * elempack + 8); + _v3_channel9 = *(data_im_ptr + v3_pos * elempack + 9); + _v3_channela = *(data_im_ptr + v3_pos * elempack + 10); + _v3_channelb = *(data_im_ptr + v3_pos * elempack + 11); + _v3_channelc = *(data_im_ptr + v3_pos * elempack + 12); + _v3_channeld = *(data_im_ptr + v3_pos * elempack + 13); + _v3_channele = *(data_im_ptr + v3_pos * elempack + 14); + _v3_channelf = *(data_im_ptr + v3_pos * elempack + 15); + } + if (v4_cond) + { + _v4_channel0 = *(data_im_ptr + v4_pos * elempack); + _v4_channel1 = *(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = *(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = *(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = *(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = *(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = *(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = *(data_im_ptr + v4_pos * elempack + 7); + _v4_channel8 = *(data_im_ptr + v4_pos * elempack + 8); + _v4_channel9 = *(data_im_ptr + v4_pos * elempack + 9); + _v4_channela = *(data_im_ptr + v4_pos * elempack + 10); + 
_v4_channelb = *(data_im_ptr + v4_pos * elempack + 11); + _v4_channelc = *(data_im_ptr + v4_pos * elempack + 12); + _v4_channeld = *(data_im_ptr + v4_pos * elempack + 13); + _v4_channele = *(data_im_ptr + v4_pos * elempack + 14); + _v4_channelf = *(data_im_ptr + v4_pos * elempack + 15); + } + _val_channel0 = w1 * _v1_channel0 + w2 * _v2_channel0 + w3 * _v3_channel0 + w4 * _v4_channel0; + _val_channel1 = w1 * _v1_channel1 + w2 * _v2_channel1 + w3 * _v3_channel1 + w4 * _v4_channel1; + _val_channel2 = w1 * _v1_channel2 + w2 * _v2_channel2 + w3 * _v3_channel2 + w4 * _v4_channel2; + _val_channel3 = w1 * _v1_channel3 + w2 * _v2_channel3 + w3 * _v3_channel3 + w4 * _v4_channel3; + _val_channel4 = w1 * _v1_channel4 + w2 * _v2_channel4 + w3 * _v3_channel4 + w4 * _v4_channel4; + _val_channel5 = w1 * _v1_channel5 + w2 * _v2_channel5 + w3 * _v3_channel5 + w4 * _v4_channel5; + _val_channel6 = w1 * _v1_channel6 + w2 * _v2_channel6 + w3 * _v3_channel6 + w4 * _v4_channel6; + _val_channel7 = w1 * _v1_channel7 + w2 * _v2_channel7 + w3 * _v3_channel7 + w4 * _v4_channel7; + _val_channel8 = w1 * _v1_channel8 + w2 * _v2_channel8 + w3 * _v3_channel8 + w4 * _v4_channel8; + _val_channel9 = w1 * _v1_channel9 + w2 * _v2_channel9 + w3 * _v3_channel9 + w4 * _v4_channel9; + _val_channela = w1 * _v1_channela + w2 * _v2_channela + w3 * _v3_channela + w4 * _v4_channela; + _val_channelb = w1 * _v1_channelb + w2 * _v2_channelb + w3 * _v3_channelb + w4 * _v4_channelb; + _val_channelc = w1 * _v1_channelc + w2 * _v2_channelc + w3 * _v3_channelc + w4 * _v4_channelc; + _val_channeld = w1 * _v1_channeld + w2 * _v2_channeld + w3 * _v3_channeld + w4 * _v4_channeld; + _val_channele = w1 * _v1_channele + w2 * _v2_channele + w3 * _v3_channele + w4 * _v4_channele; + _val_channelf = w1 * _v1_channelf + w2 * _v2_channelf + w3 * _v3_channelf + w4 * _v4_channelf; + } + if (has_mask) + { + _val_channel0 *= mask_; + _val_channel1 *= mask_; + _val_channel2 *= mask_; + _val_channel3 *= mask_; + _val_channel4 *= mask_; + _val_channel5 *= mask_; + _val_channel6 *= mask_; + _val_channel7 *= mask_; + _val_channel8 *= mask_; + _val_channel9 *= mask_; + _val_channela *= mask_; + _val_channelb *= mask_; + _val_channelc *= mask_; + _val_channeld *= mask_; + _val_channele *= mask_; + _val_channelf *= mask_; + } + float _conv_w0 = *(kptr); + float _conv_w1 = *(kptr + out_elempack); // 1 * out_elempack + _sum += (_val_channel0 * _conv_w0); + _sum += (_val_channel1 * _conv_w1); + float _conv_w2 = *(kptr + 2); // 2 * out_elempack + float _conv_w3 = *(kptr + 3); // 3 * out_elempack + _sum += (_val_channel2 * _conv_w2); + _sum += (_val_channel3 * _conv_w3); + float _conv_w4 = *(kptr + 4); // 4 * out_elempack + float _conv_w5 = *(kptr + 5); // 5 * out_elempack + _sum += (_val_channel4 * _conv_w4); + _sum += (_val_channel5 * _conv_w5); + float _conv_w6 = *(kptr + 6); // 6 * out_elempack + float _conv_w7 = *(kptr + 7); // 7 * out_elempack + _sum += (_val_channel6 * _conv_w6); + _sum += (_val_channel7 * _conv_w7); + float _conv_w8 = *(kptr + 8); // 8 * out_elempack + float _conv_w9 = *(kptr + 9); // 9 * out_elempack + _sum += (_val_channel8 * _conv_w8); + _sum += (_val_channel9 * _conv_w9); + float _conv_wa = *(kptr + 10); // 10 * out_elempack + float _conv_wb = *(kptr + 11); // 11 * out_elempack + _sum += (_val_channela * _conv_wa); + _sum += (_val_channelb * _conv_wb); + float _conv_wc = *(kptr + 12); // 12 * out_elempack + float _conv_wd = *(kptr + 13); // 13 * out_elempack + _sum += (_val_channelc * _conv_wc); + _sum += (_val_channeld * _conv_wd); + float 
_conv_we = *(kptr + 14); // 14 * out_elempack + float _conv_wf = *(kptr + 15); // 15 * out_elempack + _sum += (_val_channele * _conv_we); + _sum += (_val_channelf * _conv_wf); + kptr += wstep; + } + } + } + _sum = activation_ss(_sum, activation_type, activation_params); + *(outptr + h_col * outw + w_col) = _sum; + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack16to4.h b/src/layer/x86/deformableconv2d_pack16to4.h new file mode 100644 index 00000000000..a75e26ec8cf --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack16to4.h @@ -0,0 +1,435 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack16to4_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 16; + const int out_elempack = 4; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m128 _sum = _mm_setzero_ps(); + if (bias_data_ptr) + _sum = _mm_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = 
bottom_blob.channel(ic); + __m128 _val_channel0 = _mm_loadu_ps(zeros_ptr); + __m128 _val_channel1 = _val_channel0; + __m128 _val_channel2 = _val_channel0; + __m128 _val_channel3 = _val_channel0; + __m128 _val_channel4 = _val_channel0; + __m128 _val_channel5 = _val_channel0; + __m128 _val_channel6 = _val_channel0; + __m128 _val_channel7 = _val_channel0; + __m128 _val_channel8 = _val_channel0; + __m128 _val_channel9 = _val_channel0; + __m128 _val_channela = _val_channel0; + __m128 _val_channelb = _val_channel0; + __m128 _val_channelc = _val_channel0; + __m128 _val_channeld = _val_channel0; + __m128 _val_channele = _val_channel0; + __m128 _val_channelf = _val_channel0; + if (cond) + { + __m128 _v1_channel0 = _val_channel0; + __m128 _v1_channel1 = _val_channel0; + __m128 _v1_channel2 = _val_channel0; + __m128 _v1_channel3 = _val_channel0; + __m128 _v1_channel4 = _val_channel0; + __m128 _v1_channel5 = _val_channel0; + __m128 _v1_channel6 = _val_channel0; + __m128 _v1_channel7 = _val_channel0; + __m128 _v1_channel8 = _val_channel0; + __m128 _v1_channel9 = _val_channel0; + __m128 _v1_channela = _val_channel0; + __m128 _v1_channelb = _val_channel0; + __m128 _v1_channelc = _val_channel0; + __m128 _v1_channeld = _val_channel0; + __m128 _v1_channele = _val_channel0; + __m128 _v1_channelf = _val_channel0; + __m128 _v2_channel0 = _val_channel0; + __m128 _v2_channel1 = _val_channel0; + __m128 _v2_channel2 = _val_channel0; + __m128 _v2_channel3 = _val_channel0; + __m128 _v2_channel4 = _val_channel0; + __m128 _v2_channel5 = _val_channel0; + __m128 _v2_channel6 = _val_channel0; + __m128 _v2_channel7 = _val_channel0; + __m128 _v2_channel8 = _val_channel0; + __m128 _v2_channel9 = _val_channel0; + __m128 _v2_channela = _val_channel0; + __m128 _v2_channelb = _val_channel0; + __m128 _v2_channelc = _val_channel0; + __m128 _v2_channeld = _val_channel0; + __m128 _v2_channele = _val_channel0; + __m128 _v2_channelf = _val_channel0; + __m128 _v3_channel0 = _val_channel0; + __m128 _v3_channel1 = _val_channel0; + __m128 _v3_channel2 = _val_channel0; + __m128 _v3_channel3 = _val_channel0; + __m128 _v3_channel4 = _val_channel0; + __m128 _v3_channel5 = _val_channel0; + __m128 _v3_channel6 = _val_channel0; + __m128 _v3_channel7 = _val_channel0; + __m128 _v3_channel8 = _val_channel0; + __m128 _v3_channel9 = _val_channel0; + __m128 _v3_channela = _val_channel0; + __m128 _v3_channelb = _val_channel0; + __m128 _v3_channelc = _val_channel0; + __m128 _v3_channeld = _val_channel0; + __m128 _v3_channele = _val_channel0; + __m128 _v3_channelf = _val_channel0; + __m128 _v4_channel0 = _val_channel0; + __m128 _v4_channel1 = _val_channel0; + __m128 _v4_channel2 = _val_channel0; + __m128 _v4_channel3 = _val_channel0; + __m128 _v4_channel4 = _val_channel0; + __m128 _v4_channel5 = _val_channel0; + __m128 _v4_channel6 = _val_channel0; + __m128 _v4_channel7 = _val_channel0; + __m128 _v4_channel8 = _val_channel0; + __m128 _v4_channel9 = _val_channel0; + __m128 _v4_channela = _val_channel0; + __m128 _v4_channelb = _val_channel0; + __m128 _v4_channelc = _val_channel0; + __m128 _v4_channeld = _val_channel0; + __m128 _v4_channele = _val_channel0; + __m128 _v4_channelf = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm_load1_ps(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 4); + 
_v1_channel5 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 7); + _v1_channel8 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 8); + _v1_channel9 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 9); + _v1_channela = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 10); + _v1_channelb = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 11); + _v1_channelc = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 12); + _v1_channeld = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 13); + _v1_channele = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 14); + _v1_channelf = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 15); + } + if (v2_cond) + { + _v2_channel0 = _mm_load1_ps(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 7); + _v2_channel8 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 8); + _v2_channel9 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 9); + _v2_channela = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 10); + _v2_channelb = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 11); + _v2_channelc = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 12); + _v2_channeld = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 13); + _v2_channele = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 14); + _v2_channelf = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 15); + } + if (v3_cond) + { + _v3_channel0 = _mm_load1_ps(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 7); + _v3_channel8 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 8); + _v3_channel9 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 9); + _v3_channela = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 10); + _v3_channelb = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 11); + _v3_channelc = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 12); + _v3_channeld = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 13); + _v3_channele = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 14); + _v3_channelf = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 15); + } + if (v4_cond) + { + _v4_channel0 = _mm_load1_ps(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 7); + 
_v4_channel8 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 8); + _v4_channel9 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 9); + _v4_channela = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 10); + _v4_channelb = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 11); + _v4_channelc = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 12); + _v4_channeld = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 13); + _v4_channele = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 14); + _v4_channelf = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 15); + } + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val_channel0 = _mm_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm_comp_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm_comp_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm_comp_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm_comp_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v4_channel7, _w4, _val_channel7); + _val_channel8 = _mm_comp_fmadd_ps(_v1_channel8, _w1, _val_channel8); + _val_channel8 = _mm_comp_fmadd_ps(_v2_channel8, _w2, _val_channel8); + _val_channel8 = _mm_comp_fmadd_ps(_v3_channel8, _w3, _val_channel8); + _val_channel8 = _mm_comp_fmadd_ps(_v4_channel8, _w4, _val_channel8); + _val_channel9 = _mm_comp_fmadd_ps(_v1_channel9, _w1, _val_channel9); + _val_channel9 = _mm_comp_fmadd_ps(_v2_channel9, _w2, _val_channel9); + _val_channel9 = _mm_comp_fmadd_ps(_v3_channel9, _w3, _val_channel9); + _val_channel9 = _mm_comp_fmadd_ps(_v4_channel9, _w4, _val_channel9); + 
_val_channela = _mm_comp_fmadd_ps(_v1_channela, _w1, _val_channela); + _val_channela = _mm_comp_fmadd_ps(_v2_channela, _w2, _val_channela); + _val_channela = _mm_comp_fmadd_ps(_v3_channela, _w3, _val_channela); + _val_channela = _mm_comp_fmadd_ps(_v4_channela, _w4, _val_channela); + _val_channelb = _mm_comp_fmadd_ps(_v1_channelb, _w1, _val_channelb); + _val_channelb = _mm_comp_fmadd_ps(_v2_channelb, _w2, _val_channelb); + _val_channelb = _mm_comp_fmadd_ps(_v3_channelb, _w3, _val_channelb); + _val_channelb = _mm_comp_fmadd_ps(_v4_channelb, _w4, _val_channelb); + _val_channelc = _mm_comp_fmadd_ps(_v1_channelc, _w1, _val_channelc); + _val_channelc = _mm_comp_fmadd_ps(_v2_channelc, _w2, _val_channelc); + _val_channelc = _mm_comp_fmadd_ps(_v3_channelc, _w3, _val_channelc); + _val_channelc = _mm_comp_fmadd_ps(_v4_channelc, _w4, _val_channelc); + _val_channeld = _mm_comp_fmadd_ps(_v1_channeld, _w1, _val_channeld); + _val_channeld = _mm_comp_fmadd_ps(_v2_channeld, _w2, _val_channeld); + _val_channeld = _mm_comp_fmadd_ps(_v3_channeld, _w3, _val_channeld); + _val_channeld = _mm_comp_fmadd_ps(_v4_channeld, _w4, _val_channeld); + _val_channele = _mm_comp_fmadd_ps(_v1_channele, _w1, _val_channele); + _val_channele = _mm_comp_fmadd_ps(_v2_channele, _w2, _val_channele); + _val_channele = _mm_comp_fmadd_ps(_v3_channele, _w3, _val_channele); + _val_channele = _mm_comp_fmadd_ps(_v4_channele, _w4, _val_channele); + _val_channelf = _mm_comp_fmadd_ps(_v1_channelf, _w1, _val_channelf); + _val_channelf = _mm_comp_fmadd_ps(_v2_channelf, _w2, _val_channelf); + _val_channelf = _mm_comp_fmadd_ps(_v3_channelf, _w3, _val_channelf); + _val_channelf = _mm_comp_fmadd_ps(_v4_channelf, _w4, _val_channelf); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val_channel0 = _mm_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm_mul_ps(_val_channel7, _mask); + _val_channel8 = _mm_mul_ps(_val_channel8, _mask); + _val_channel9 = _mm_mul_ps(_val_channel9, _mask); + _val_channela = _mm_mul_ps(_val_channela, _mask); + _val_channelb = _mm_mul_ps(_val_channelb, _mask); + _val_channelc = _mm_mul_ps(_val_channelc, _mask); + _val_channeld = _mm_mul_ps(_val_channeld, _mask); + _val_channele = _mm_mul_ps(_val_channele, _mask); + _val_channelf = _mm_mul_ps(_val_channelf, _mask); + } + __m128 _conv_w0 = _mm_load_ps(kptr); + __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m128 _conv_w2 = _mm_load_ps(kptr + 8); // 2 * out_elempack + __m128 _conv_w3 = _mm_load_ps(kptr + 12); // 3 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m128 _conv_w4 = _mm_load_ps(kptr + 16); // 4 * out_elempack + __m128 _conv_w5 = _mm_load_ps(kptr + 20); // 5 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m128 _conv_w6 = _mm_load_ps(kptr + 24); // 6 * out_elempack + __m128 _conv_w7 = _mm_load_ps(kptr + 28); // 7 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel7, 
_conv_w7, _sum); + __m128 _conv_w8 = _mm_load_ps(kptr + 32); // 8 * out_elempack + __m128 _conv_w9 = _mm_load_ps(kptr + 36); // 9 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel8, _conv_w8, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel9, _conv_w9, _sum); + __m128 _conv_wa = _mm_load_ps(kptr + 40); // 10 * out_elempack + __m128 _conv_wb = _mm_load_ps(kptr + 44); // 11 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channela, _conv_wa, _sum); + _sum = _mm_comp_fmadd_ps(_val_channelb, _conv_wb, _sum); + __m128 _conv_wc = _mm_load_ps(kptr + 48); // 12 * out_elempack + __m128 _conv_wd = _mm_load_ps(kptr + 52); // 13 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channelc, _conv_wc, _sum); + _sum = _mm_comp_fmadd_ps(_val_channeld, _conv_wd, _sum); + __m128 _conv_we = _mm_load_ps(kptr + 56); // 14 * out_elempack + __m128 _conv_wf = _mm_load_ps(kptr + 60); // 15 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channele, _conv_we, _sum); + _sum = _mm_comp_fmadd_ps(_val_channelf, _conv_wf, _sum); + kptr += wstep; + } + } + } + _sum = activation_sse(_sum, activation_type, activation_params); + _mm_storeu_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack16to8.h b/src/layer/x86/deformableconv2d_pack16to8.h new file mode 100644 index 00000000000..f44fc9ad0c8 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack16to8.h @@ -0,0 +1,435 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack16to8_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+    const int size = outw * outh;
+    const int maxk = kernel_w * kernel_h;
+
+    const float* bias_data_ptr = bias_data;
+    const int elempack = 16;
+    const int out_elempack = 8;
+    const int wstep = out_elempack * elempack;
+    const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
+    const float* zeros_ptr = zeros;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int h_col = 0; h_col < outh; h_col++)
+    {
+        for (int w_col = 0; w_col < outw; w_col++)
+        {
+            int h_in = h_col * stride_h - pad_top;
+            int w_in = w_col * stride_w - pad_left;
+            for (int oc = 0; oc < outch; oc++)
+            {
+                const float* kptr = weight_data_packed.channel(oc);
+                float* outptr = top_blob.channel(oc);
+                __m256 _sum = _mm256_setzero_ps();
+                if (bias_data_ptr)
+                    _sum = _mm256_loadu_ps(bias_data_ptr + oc * out_elempack);
+                for (int i = 0; i < kernel_h; i++)
+                {
+                    for (int j = 0; j < kernel_w; j++)
+                    {
+                        float offset_h = 0.f;
+                        float offset_w = 0.f;
+                        float mask_ = 1.f;
+                        if (offset_not_pack)
+                        {
+                            offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
+                            offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
+                        }
+                        else
+                        {
+                            const int y_c = (i * kernel_w + j) * 2;
+                            const int x_c = (i * kernel_w + j) * 2 + 1;
+                            offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack];
+                            offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack];
+                        }
+                        if (has_mask)
+                        {
+                            const Mat& mask = bottom_blobs[2];
+                            if (mask_not_pack)
+                            {
+                                mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
+                            }
+                            else
+                            {
+                                const int m_c = i * kernel_w + j;
+                                mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
+                            }
+                        }
+                        const float h_im = h_in + i * dilation_h + offset_h;
+                        const float w_im = w_in + j * dilation_w + offset_w;
+
+                        // Bilinear
+                        const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w;
+                        float w1 = 0.f;
+                        float w2 = 0.f;
+                        float w3 = 0.f;
+                        float w4 = 0.f;
+                        bool v1_cond = false;
+                        bool v2_cond = false;
+                        bool v3_cond = false;
+                        bool v4_cond = false;
+                        int v1_pos = 0;
+                        int v2_pos = 0;
+                        int v3_pos = 0;
+                        int v4_pos = 0;
+                        if (cond)
+                        {
+                            int h_low = floor(h_im);
+                            int w_low = floor(w_im);
+                            int h_high = h_low + 1;
+                            int w_high = w_low + 1;
+
+                            float lh = h_im - h_low;
+                            float lw = w_im - w_low;
+                            float hh = 1 - lh;
+                            float hw = 1 - lw;
+
+                            v1_cond = (h_low >= 0 && w_low >= 0);
+                            v2_cond = (h_low >= 0 && w_high <= w - 1);
+                            v3_cond = (h_high <= h - 1 && w_low >= 0);
+                            v4_cond = (h_high <= h - 1 && w_high <= w - 1);
+                            if (v1_cond)
+                                v1_pos = h_low * w + w_low;
+                            if (v2_cond)
+                                v2_pos = h_low * w + w_high;
+                            if (v3_cond)
+                                v3_pos = h_high * w + w_low;
+                            if (v4_cond)
+                                v4_pos = h_high * w + w_high;
+
+                            w1 = hh * hw;
+                            w2 = hh * lw;
+                            w3 = lh * hw;
+                            w4 = lh * lw;
+                        }
+                        const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1};
+                        const float* w1_ptr = w1s;
+                        const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2};
+                        const float* w2_ptr = w2s;
+                        const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3};
+                        const float* w3_ptr = w3s;
+                        const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4};
+                        const float* w4_ptr = w4s;
+                        const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_};
+                        const float* mask_ptr = masks;
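+                        // splat the scalar bilinear weights and the modulation mask across
+                        // out_elempack lanes via small stack arrays so the inner loop can use
+                        // plain vector loads before the fused multiply-adds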
+
+                        for (int ic = 0; ic < inch; ic++)
+                        {
+                            const float* data_im_ptr = bottom_blob.channel(ic);
+                            __m256 _val_channel0 = _mm256_loadu_ps(zeros_ptr);
+                            __m256 _val_channel1 = _val_channel0;
+                            __m256 _val_channel2 = _val_channel0;
+                            __m256 _val_channel3 = _val_channel0;
+                            __m256 _val_channel4 = _val_channel0;
+                            __m256 _val_channel5 = _val_channel0;
+                            __m256 _val_channel6 = _val_channel0;
+                            __m256 _val_channel7 = _val_channel0;
+                            __m256 _val_channel8 = _val_channel0;
+                            __m256 _val_channel9 = _val_channel0;
+                            __m256 _val_channela = _val_channel0;
+                            __m256 _val_channelb = _val_channel0;
+                            __m256 _val_channelc = _val_channel0;
+                            __m256 _val_channeld = _val_channel0;
+                            __m256 _val_channele = _val_channel0;
+                            __m256 _val_channelf = _val_channel0;
+                            if (cond)
+                            {
+                                __m256 _v1_channel0 = _val_channel0;
+                                __m256 _v1_channel1 = _val_channel0;
+                                __m256 _v1_channel2 = _val_channel0;
+                                __m256 _v1_channel3 = _val_channel0;
+                                __m256 _v1_channel4 = _val_channel0;
+                                __m256 _v1_channel5 = _val_channel0;
+                                __m256 _v1_channel6 = _val_channel0;
+                                __m256 _v1_channel7 = _val_channel0;
+                                __m256 _v1_channel8 = _val_channel0;
+                                __m256 _v1_channel9 = _val_channel0;
+                                __m256 _v1_channela = _val_channel0;
+                                __m256 _v1_channelb = _val_channel0;
+                                __m256 _v1_channelc = _val_channel0;
+                                __m256 _v1_channeld = _val_channel0;
+                                __m256 _v1_channele = _val_channel0;
+                                __m256 _v1_channelf = _val_channel0;
+                                __m256 _v2_channel0 = _val_channel0;
+                                __m256 _v2_channel1 = _val_channel0;
+                                __m256 _v2_channel2 = _val_channel0;
+                                __m256 _v2_channel3 = _val_channel0;
+                                __m256 _v2_channel4 = _val_channel0;
+                                __m256 _v2_channel5 = _val_channel0;
+                                __m256 _v2_channel6 = _val_channel0;
+                                __m256 _v2_channel7 = _val_channel0;
+                                __m256 _v2_channel8 = _val_channel0;
+                                __m256 _v2_channel9 = _val_channel0;
+                                __m256 _v2_channela = _val_channel0;
+                                __m256 _v2_channelb = _val_channel0;
+                                __m256 _v2_channelc = _val_channel0;
+                                __m256 _v2_channeld = _val_channel0;
+                                __m256 _v2_channele = _val_channel0;
+                                __m256 _v2_channelf = _val_channel0;
+                                __m256 _v3_channel0 = _val_channel0;
+                                __m256 _v3_channel1 = _val_channel0;
+                                __m256 _v3_channel2 = _val_channel0;
+                                __m256 _v3_channel3 = _val_channel0;
+                                __m256 _v3_channel4 = _val_channel0;
+                                __m256 _v3_channel5 = _val_channel0;
+                                __m256 _v3_channel6 = _val_channel0;
+                                __m256 _v3_channel7 = _val_channel0;
+                                __m256 _v3_channel8 = _val_channel0;
+                                __m256 _v3_channel9 = _val_channel0;
+                                __m256 _v3_channela = _val_channel0;
+                                __m256 _v3_channelb = _val_channel0;
+                                __m256 _v3_channelc = _val_channel0;
+                                __m256 _v3_channeld = _val_channel0;
+                                __m256 _v3_channele = _val_channel0;
+                                __m256 _v3_channelf = _val_channel0;
+                                __m256 _v4_channel0 = _val_channel0;
+                                __m256 _v4_channel1 = _val_channel0;
+                                __m256 _v4_channel2 = _val_channel0;
+                                __m256 _v4_channel3 = _val_channel0;
+                                __m256 _v4_channel4 = _val_channel0;
+                                __m256 _v4_channel5 = _val_channel0;
+                                __m256 _v4_channel6 = _val_channel0;
+                                __m256 _v4_channel7 = _val_channel0;
+                                __m256 _v4_channel8 = _val_channel0;
+                                __m256 _v4_channel9 = _val_channel0;
+                                __m256 _v4_channela = _val_channel0;
+                                __m256 _v4_channelb = _val_channel0;
+                                __m256 _v4_channelc = _val_channel0;
+                                __m256 _v4_channeld = _val_channel0;
+                                __m256 _v4_channele = _val_channel0;
+                                __m256 _v4_channelf = _val_channel0;
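+                                // _v1.._v4 are the four bilinear corner samples; each of the 16
+                                // input lanes is broadcast to a full vector so it can be weighted
+                                // against out_elempack outputs at once, and lanes whose corner
+                                // falls outside the image keep their zero initializer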
+                                if (v1_cond)
+                                {
+                                    _v1_channel0 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack);
+                                    _v1_channel1 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 1);
+                                    _v1_channel2 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 2);
+                                    _v1_channel3 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 3);
+                                    _v1_channel4 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 4);
+                                    _v1_channel5 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 5);
+                                    _v1_channel6 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 6);
+                                    _v1_channel7 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 7);
+                                    _v1_channel8 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 8);
+                                    _v1_channel9 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 9);
+                                    _v1_channela = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 10);
+                                    _v1_channelb = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 11);
+                                    _v1_channelc = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 12);
+                                    _v1_channeld = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 13);
+                                    _v1_channele = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 14);
+                                    _v1_channelf = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 15);
+                                }
+                                if (v2_cond)
+                                {
+                                    _v2_channel0 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack);
+                                    _v2_channel1 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 1);
+                                    _v2_channel2 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 2);
+                                    _v2_channel3 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 3);
+                                    _v2_channel4 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 4);
+                                    _v2_channel5 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 5);
+                                    _v2_channel6 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 6);
+                                    _v2_channel7 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 7);
+                                    _v2_channel8 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 8);
+                                    _v2_channel9 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 9);
+                                    _v2_channela = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 10);
+                                    _v2_channelb = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 11);
+                                    _v2_channelc = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 12);
+                                    _v2_channeld = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 13);
+                                    _v2_channele = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 14);
+                                    _v2_channelf = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 15);
+                                }
+                                if (v3_cond)
+                                {
+                                    _v3_channel0 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack);
+                                    _v3_channel1 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 1);
+                                    _v3_channel2 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 2);
+                                    _v3_channel3 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 3);
+                                    _v3_channel4 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 4);
+                                    _v3_channel5 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 5);
+                                    _v3_channel6 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 6);
+                                    _v3_channel7 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 7);
+                                    _v3_channel8 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 8);
+                                    _v3_channel9 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 9);
+                                    _v3_channela = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 10);
+                                    _v3_channelb = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 11);
+                                    _v3_channelc = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 12);
+                                    _v3_channeld = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 13);
+                                    _v3_channele = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 14);
+                                    _v3_channelf = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 15);
+                                }
+                                if (v4_cond)
+                                {
+                                    _v4_channel0 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack);
+                                    _v4_channel1 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 1);
+                                    _v4_channel2 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 2);
+                                    _v4_channel3 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 3);
+                                    _v4_channel4 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 4);
+                                    _v4_channel5 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 5);
+                                    _v4_channel6 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 6);
+                                    _v4_channel7 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 7);
+                                    _v4_channel8 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 8);
+                                    _v4_channel9 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 9);
+                                    _v4_channela = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 10);
+                                    _v4_channelb = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 11);
+                                    _v4_channelc = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 12);
+                                    _v4_channeld = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 13);
+                                    _v4_channele = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 14);
+                                    _v4_channelf = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 15);
+                                }
+                                __m256 _w1 = _mm256_loadu_ps(w1_ptr);
+                                __m256 _w2 = _mm256_loadu_ps(w2_ptr);
+                                __m256 _w3 = _mm256_loadu_ps(w3_ptr);
+                                __m256 _w4 = _mm256_loadu_ps(w4_ptr);
+                                _val_channel0 = _mm256_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0);
+                                _val_channel0 = _mm256_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0);
+                                _val_channel0 = _mm256_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0);
+                                _val_channel0 = _mm256_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0);
+                                _val_channel1 = _mm256_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1);
+                                _val_channel1 = _mm256_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1);
+                                _val_channel1 = _mm256_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1);
+                                _val_channel1 = _mm256_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1);
+                                _val_channel2 = _mm256_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2);
+                                _val_channel2 = _mm256_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2);
+                                _val_channel2 = _mm256_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2);
+                                _val_channel2 = _mm256_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2);
+                                _val_channel3 = _mm256_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3);
+                                _val_channel3 = _mm256_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3);
+                                _val_channel3 = _mm256_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3);
+                                _val_channel3 = _mm256_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3);
+                                _val_channel4 = _mm256_comp_fmadd_ps(_v1_channel4, _w1, _val_channel4);
+                                _val_channel4 = _mm256_comp_fmadd_ps(_v2_channel4, _w2, _val_channel4);
+                                _val_channel4 = _mm256_comp_fmadd_ps(_v3_channel4, _w3, _val_channel4);
+                                _val_channel4 = _mm256_comp_fmadd_ps(_v4_channel4, _w4, _val_channel4);
+                                _val_channel5 = _mm256_comp_fmadd_ps(_v1_channel5, _w1, _val_channel5);
+                                _val_channel5 = _mm256_comp_fmadd_ps(_v2_channel5, _w2, _val_channel5);
+                                _val_channel5 = _mm256_comp_fmadd_ps(_v3_channel5, _w3, _val_channel5);
+                                _val_channel5 = _mm256_comp_fmadd_ps(_v4_channel5, _w4, _val_channel5);
+                                _val_channel6 = _mm256_comp_fmadd_ps(_v1_channel6, _w1, _val_channel6);
+                                _val_channel6 = _mm256_comp_fmadd_ps(_v2_channel6, _w2, _val_channel6);
+                                _val_channel6 = _mm256_comp_fmadd_ps(_v3_channel6, _w3, _val_channel6);
+                                _val_channel6 = _mm256_comp_fmadd_ps(_v4_channel6, _w4, _val_channel6);
+                                _val_channel7 = _mm256_comp_fmadd_ps(_v1_channel7, _w1, _val_channel7);
+                                _val_channel7 = _mm256_comp_fmadd_ps(_v2_channel7, _w2, _val_channel7);
+                                _val_channel7 = _mm256_comp_fmadd_ps(_v3_channel7, _w3, _val_channel7);
+                                _val_channel7 = _mm256_comp_fmadd_ps(_v4_channel7, _w4, _val_channel7);
+                                _val_channel8 = _mm256_comp_fmadd_ps(_v1_channel8, _w1, _val_channel8);
+                                _val_channel8 = _mm256_comp_fmadd_ps(_v2_channel8, _w2, _val_channel8);
+                                _val_channel8 = _mm256_comp_fmadd_ps(_v3_channel8, _w3, _val_channel8);
+                                _val_channel8 = _mm256_comp_fmadd_ps(_v4_channel8, _w4, _val_channel8);
+                                _val_channel9 = _mm256_comp_fmadd_ps(_v1_channel9, _w1, _val_channel9);
+                                _val_channel9 = _mm256_comp_fmadd_ps(_v2_channel9, _w2, _val_channel9);
+                                _val_channel9 = _mm256_comp_fmadd_ps(_v3_channel9, _w3, _val_channel9);
+                                _val_channel9 = _mm256_comp_fmadd_ps(_v4_channel9, _w4, _val_channel9);
+                                _val_channela = _mm256_comp_fmadd_ps(_v1_channela, _w1, _val_channela);
+                                _val_channela = _mm256_comp_fmadd_ps(_v2_channela, _w2, _val_channela);
+                                _val_channela = _mm256_comp_fmadd_ps(_v3_channela, _w3, _val_channela);
+                                _val_channela = _mm256_comp_fmadd_ps(_v4_channela, _w4, _val_channela);
+                                _val_channelb = _mm256_comp_fmadd_ps(_v1_channelb, _w1, _val_channelb);
+                                _val_channelb = _mm256_comp_fmadd_ps(_v2_channelb, _w2, _val_channelb);
+                                _val_channelb = _mm256_comp_fmadd_ps(_v3_channelb, _w3, _val_channelb);
+                                _val_channelb = _mm256_comp_fmadd_ps(_v4_channelb, _w4, _val_channelb);
+                                _val_channelc = _mm256_comp_fmadd_ps(_v1_channelc, _w1, _val_channelc);
+                                _val_channelc = _mm256_comp_fmadd_ps(_v2_channelc, _w2, _val_channelc);
+                                _val_channelc = _mm256_comp_fmadd_ps(_v3_channelc, _w3, _val_channelc);
+                                _val_channelc = _mm256_comp_fmadd_ps(_v4_channelc, _w4, _val_channelc);
+                                _val_channeld = _mm256_comp_fmadd_ps(_v1_channeld, _w1, _val_channeld);
+                                _val_channeld = _mm256_comp_fmadd_ps(_v2_channeld, _w2, _val_channeld);
+                                _val_channeld = _mm256_comp_fmadd_ps(_v3_channeld, _w3, _val_channeld);
+                                _val_channeld = _mm256_comp_fmadd_ps(_v4_channeld, _w4, _val_channeld);
+                                _val_channele = _mm256_comp_fmadd_ps(_v1_channele, _w1, _val_channele);
+                                _val_channele = _mm256_comp_fmadd_ps(_v2_channele, _w2, _val_channele);
+                                _val_channele = _mm256_comp_fmadd_ps(_v3_channele, _w3, _val_channele);
+                                _val_channele = _mm256_comp_fmadd_ps(_v4_channele, _w4, _val_channele);
+                                _val_channelf = _mm256_comp_fmadd_ps(_v1_channelf, _w1, _val_channelf);
+                                _val_channelf = _mm256_comp_fmadd_ps(_v2_channelf, _w2, _val_channelf);
+                                _val_channelf = _mm256_comp_fmadd_ps(_v3_channelf, _w3, _val_channelf);
+                                _val_channelf = _mm256_comp_fmadd_ps(_v4_channelf, _w4, _val_channelf);
+                            }
+                            if (has_mask)
+                            {
+                                __m256 _mask = _mm256_loadu_ps(mask_ptr);
+                                _val_channel0 = _mm256_mul_ps(_val_channel0, _mask);
+                                _val_channel1 = _mm256_mul_ps(_val_channel1, _mask);
+                                _val_channel2 = _mm256_mul_ps(_val_channel2, _mask);
+                                _val_channel3 = _mm256_mul_ps(_val_channel3, _mask);
+                                _val_channel4 = _mm256_mul_ps(_val_channel4, _mask);
+                                _val_channel5 = _mm256_mul_ps(_val_channel5, _mask);
+                                _val_channel6 = _mm256_mul_ps(_val_channel6, _mask);
+                                _val_channel7 = _mm256_mul_ps(_val_channel7, _mask);
+                                _val_channel8 = _mm256_mul_ps(_val_channel8, _mask);
+                                _val_channel9 = _mm256_mul_ps(_val_channel9, _mask);
+                                _val_channela = _mm256_mul_ps(_val_channela, _mask);
+                                _val_channelb = _mm256_mul_ps(_val_channelb, _mask);
+                                _val_channelc = _mm256_mul_ps(_val_channelc, _mask);
+                                _val_channeld = _mm256_mul_ps(_val_channeld, _mask);
+                                _val_channele = _mm256_mul_ps(_val_channele, _mask);
+                                _val_channelf = _mm256_mul_ps(_val_channelf, _mask);
+                            }
+                            __m256 _conv_w0 = _mm256_load_ps(kptr);
+                            __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack); // 1 * out_elempack
+                            _sum = _mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum);
+                            _sum = _mm256_comp_fmadd_ps(_val_channel1, _conv_w1, _sum);
+                            __m256 _conv_w2 = _mm256_load_ps(kptr + 16); // 2 * out_elempack
+                            __m256 _conv_w3 = _mm256_load_ps(kptr + 24); // 3 * out_elempack
+                            _sum = _mm256_comp_fmadd_ps(_val_channel2, _conv_w2, _sum);
+                            _sum = _mm256_comp_fmadd_ps(_val_channel3, _conv_w3, _sum);
+                            __m256 _conv_w4 = _mm256_load_ps(kptr + 32); // 4 * out_elempack
+                            __m256 _conv_w5 = _mm256_load_ps(kptr + 40); // 5 * out_elempack
+                            _sum = _mm256_comp_fmadd_ps(_val_channel4, _conv_w4, _sum);
+                            _sum = _mm256_comp_fmadd_ps(_val_channel5, _conv_w5, _sum);
+                            __m256 _conv_w6 = _mm256_load_ps(kptr + 48); // 6 * out_elempack
+                            __m256 _conv_w7 = _mm256_load_ps(kptr + 56); // 7 * out_elempack
+                            _sum = _mm256_comp_fmadd_ps(_val_channel6, _conv_w6, _sum);
+                            _sum = _mm256_comp_fmadd_ps(_val_channel7, _conv_w7, _sum);
+                            __m256 _conv_w8 = _mm256_load_ps(kptr + 64); // 8 * out_elempack
+                            __m256 _conv_w9 = _mm256_load_ps(kptr + 72); // 9 * out_elempack
+                            _sum = _mm256_comp_fmadd_ps(_val_channel8, _conv_w8, _sum);
+                            _sum = _mm256_comp_fmadd_ps(_val_channel9, _conv_w9, _sum);
+                            __m256 _conv_wa = _mm256_load_ps(kptr + 80); // 10 * out_elempack
+                            __m256 _conv_wb = _mm256_load_ps(kptr + 88); // 11 * out_elempack
+                            _sum = _mm256_comp_fmadd_ps(_val_channela, _conv_wa, _sum);
+                            _sum = _mm256_comp_fmadd_ps(_val_channelb, _conv_wb, _sum);
+                            __m256 _conv_wc = _mm256_load_ps(kptr + 96);  // 12 * out_elempack
+                            __m256 _conv_wd = _mm256_load_ps(kptr + 104); // 13 * out_elempack
+                            _sum = _mm256_comp_fmadd_ps(_val_channelc, _conv_wc, _sum);
+                            _sum = _mm256_comp_fmadd_ps(_val_channeld, _conv_wd, _sum);
+                            __m256 _conv_we = _mm256_load_ps(kptr + 112); // 14 * out_elempack
+                            __m256 _conv_wf = _mm256_load_ps(kptr + 120); // 15 * out_elempack
+                            _sum = _mm256_comp_fmadd_ps(_val_channele, _conv_we, _sum);
+                            _sum = _mm256_comp_fmadd_ps(_val_channelf, _conv_wf, _sum);
+                            kptr += wstep;
+                        }
+                    }
+                }
+                _sum = activation_avx(_sum, activation_type, activation_params);
+                _mm256_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum);
+            }
+        }
+    }
+}
diff --git a/src/layer/x86/deformableconv2d_pack1to16.h b/src/layer/x86/deformableconv2d_pack1to16.h
new file mode 100644
index 00000000000..b50e787e9c8
--- /dev/null
+++ b/src/layer/x86/deformableconv2d_pack1to16.h
@@ -0,0 +1,195 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
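+
+// input elempack 1 -> output elempack 16: a single scalar input channel is
+// broadcast and accumulated against 16 packed output channels per kptr step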
+
+static void deformableconv2d_pack1to16_avx512(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt)
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& offset = bottom_blobs[1];
+    const bool has_mask = (bottom_blobs.size() == 3);
+    const bool offset_not_pack = offset.elempack == 1;
+    const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true;
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+    const int size = outw * outh;
+    const int maxk = kernel_w * kernel_h;
+
+    const float* bias_data_ptr = bias_data;
+    const int elempack = 1;
+    const int out_elempack = 16;
+    const int wstep = out_elempack * elempack;
+    const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
+    const float* zeros_ptr = zeros;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int h_col = 0; h_col < outh; h_col++)
+    {
+        for (int w_col = 0; w_col < outw; w_col++)
+        {
+            int h_in = h_col * stride_h - pad_top;
+            int w_in = w_col * stride_w - pad_left;
+            for (int oc = 0; oc < outch; oc++)
+            {
+                const float* kptr = weight_data_packed.channel(oc);
+                float* outptr = top_blob.channel(oc);
+                __m512 _sum = _mm512_setzero_ps();
+                if (bias_data_ptr)
+                    _sum = _mm512_loadu_ps(bias_data_ptr + oc * out_elempack);
+                for (int i = 0; i < kernel_h; i++)
+                {
+                    for (int j = 0; j < kernel_w; j++)
+                    {
+                        float offset_h = 0.f;
+                        float offset_w = 0.f;
+                        float mask_ = 1.f;
+                        if (offset_not_pack)
+                        {
+                            offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
+                            offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
+                        }
+                        else
+                        {
+                            const int y_c = (i * kernel_w + j) * 2;
+                            const int x_c = (i * kernel_w + j) * 2 + 1;
+                            offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack];
+                            offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack];
+                        }
+                        if (has_mask)
+                        {
+                            const Mat& mask = bottom_blobs[2];
+                            if (mask_not_pack)
+                            {
+                                mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
+                            }
+                            else
+                            {
+                                const int m_c = i * kernel_w + j;
+                                mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
+                            }
+                        }
+                        const float h_im = h_in + i * dilation_h + offset_h;
+                        const float w_im = w_in + j * dilation_w + offset_w;
+
+                        // Bilinear
+                        const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w;
+                        float w1 = 0.f;
+                        float w2 = 0.f;
+                        float w3 = 0.f;
+                        float w4 = 0.f;
+                        bool v1_cond = false;
+                        bool v2_cond = false;
+                        bool v3_cond = false;
+                        bool v4_cond = false;
+                        int v1_pos = 0;
+                        int v2_pos = 0;
+                        int v3_pos = 0;
+                        int v4_pos = 0;
+                        if (cond)
+                        {
+                            int h_low = floor(h_im);
+                            int w_low = floor(w_im);
+                            int h_high = h_low + 1;
+                            int w_high = w_low + 1;
+
+                            float lh = h_im - h_low;
+                            float lw = w_im - w_low;
+                            float hh = 1 - lh;
+                            float hw = 1 - lw;
+
+                            v1_cond = (h_low >= 0 && w_low >= 0);
+                            v2_cond = (h_low >= 0 && w_high <= w - 1);
+                            v3_cond = (h_high <= h - 1 && w_low >= 0);
+                            v4_cond = (h_high <= h - 1 && w_high <= w - 1);
+                            if (v1_cond)
+                                v1_pos = h_low * w + w_low;
+                            if (v2_cond)
+                                v2_pos = h_low * w + w_high;
+                            if (v3_cond)
+                                v3_pos = h_high * w + w_low;
+                            if (v4_cond)
+                                v4_pos = h_high * w + w_high;
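+                            // corners outside the image keep pos 0 and a false cond, so they
+                            // are skipped at load time: implicit zero padding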
+
+                            w1 = hh * hw;
+                            w2 = hh * lw;
+                            w3 = lh * hw;
+                            w4 = lh * lw;
+                        }
+                        const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1};
+                        const float* w1_ptr = w1s;
+                        const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2};
+                        const float* w2_ptr = w2s;
+                        const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3};
+                        const float* w3_ptr = w3s;
+                        const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4};
+                        const float* w4_ptr = w4s;
+                        const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_};
+                        const float* mask_ptr = masks;
+
+                        for (int ic = 0; ic < inch; ic++)
+                        {
+                            const float* data_im_ptr = bottom_blob.channel(ic);
+                            __m512 _val_channel0 = _mm512_loadu_ps(zeros_ptr);
+                            if (cond)
+                            {
+                                __m512 _v1_channel0 = _val_channel0;
+                                __m512 _v2_channel0 = _val_channel0;
+                                __m512 _v3_channel0 = _val_channel0;
+                                __m512 _v4_channel0 = _val_channel0;
+                                if (v1_cond)
+                                {
+                                    _v1_channel0 = _mm512_set1_ps(data_im_ptr[v1_pos]);
+                                }
+                                if (v2_cond)
+                                {
+                                    _v2_channel0 = _mm512_set1_ps(data_im_ptr[v2_pos]);
+                                }
+                                if (v3_cond)
+                                {
+                                    _v3_channel0 = _mm512_set1_ps(data_im_ptr[v3_pos]);
+                                }
+                                if (v4_cond)
+                                {
+                                    _v4_channel0 = _mm512_set1_ps(data_im_ptr[v4_pos]);
+                                }
+                                __m512 _w1 = _mm512_loadu_ps(w1_ptr);
+                                __m512 _w2 = _mm512_loadu_ps(w2_ptr);
+                                __m512 _w3 = _mm512_loadu_ps(w3_ptr);
+                                __m512 _w4 = _mm512_loadu_ps(w4_ptr);
+                                _val_channel0 = _mm512_fmadd_ps(_v1_channel0, _w1, _val_channel0);
+                                _val_channel0 = _mm512_fmadd_ps(_v2_channel0, _w2, _val_channel0);
+                                _val_channel0 = _mm512_fmadd_ps(_v3_channel0, _w3, _val_channel0);
+                                _val_channel0 = _mm512_fmadd_ps(_v4_channel0, _w4, _val_channel0);
+                            }
+                            if (has_mask)
+                            {
+                                __m512 _mask = _mm512_loadu_ps(mask_ptr);
+                                _val_channel0 = _mm512_mul_ps(_val_channel0, _mask);
+                            }
+                            __m512 _conv_w0 = _mm512_load_ps(kptr);
+                            _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum);
+                            kptr += wstep;
+                        }
+                    }
+                }
+                _sum = activation_avx512(_sum, activation_type, activation_params);
+                _mm512_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum);
+            }
+        }
+    }
+}
diff --git a/src/layer/x86/deformableconv2d_pack1to4.h b/src/layer/x86/deformableconv2d_pack1to4.h
new file mode 100644
index 00000000000..0388111306f
--- /dev/null
+++ b/src/layer/x86/deformableconv2d_pack1to4.h
@@ -0,0 +1,195 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
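+
+// bilinear weights sum to 1: e.g. a sample at (h_im, w_im) = (2.25, 3.5)
+// gives lh = 0.25, lw = 0.5, so w1 = 0.375, w2 = 0.375, w3 = 0.125, w4 = 0.125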
+
+static void deformableconv2d_pack1to4_sse(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt)
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& offset = bottom_blobs[1];
+    const bool has_mask = (bottom_blobs.size() == 3);
+    const bool offset_not_pack = offset.elempack == 1;
+    const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true;
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+    const int size = outw * outh;
+    const int maxk = kernel_w * kernel_h;
+
+    const float* bias_data_ptr = bias_data;
+    const int elempack = 1;
+    const int out_elempack = 4;
+    const int wstep = out_elempack * elempack;
+    const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f};
+    const float* zeros_ptr = zeros;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int h_col = 0; h_col < outh; h_col++)
+    {
+        for (int w_col = 0; w_col < outw; w_col++)
+        {
+            int h_in = h_col * stride_h - pad_top;
+            int w_in = w_col * stride_w - pad_left;
+            for (int oc = 0; oc < outch; oc++)
+            {
+                const float* kptr = weight_data_packed.channel(oc);
+                float* outptr = top_blob.channel(oc);
+                __m128 _sum = _mm_setzero_ps();
+                if (bias_data_ptr)
+                    _sum = _mm_loadu_ps(bias_data_ptr + oc * out_elempack);
+                for (int i = 0; i < kernel_h; i++)
+                {
+                    for (int j = 0; j < kernel_w; j++)
+                    {
+                        float offset_h = 0.f;
+                        float offset_w = 0.f;
+                        float mask_ = 1.f;
+                        if (offset_not_pack)
+                        {
+                            offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
+                            offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
+                        }
+                        else
+                        {
+                            const int y_c = (i * kernel_w + j) * 2;
+                            const int x_c = (i * kernel_w + j) * 2 + 1;
+                            offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack];
+                            offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack];
+                        }
+                        if (has_mask)
+                        {
+                            const Mat& mask = bottom_blobs[2];
+                            if (mask_not_pack)
+                            {
+                                mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
+                            }
+                            else
+                            {
+                                const int m_c = i * kernel_w + j;
+                                mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
+                            }
+                        }
+                        const float h_im = h_in + i * dilation_h + offset_h;
+                        const float w_im = w_in + j * dilation_w + offset_w;
+
+                        // Bilinear
+                        const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w;
+                        float w1 = 0.f;
+                        float w2 = 0.f;
+                        float w3 = 0.f;
+                        float w4 = 0.f;
+                        bool v1_cond = false;
+                        bool v2_cond = false;
+                        bool v3_cond = false;
+                        bool v4_cond = false;
+                        int v1_pos = 0;
+                        int v2_pos = 0;
+                        int v3_pos = 0;
+                        int v4_pos = 0;
+                        if (cond)
+                        {
+                            int h_low = floor(h_im);
+                            int w_low = floor(w_im);
+                            int h_high = h_low + 1;
+                            int w_high = w_low + 1;
+
+                            float lh = h_im - h_low;
+                            float lw = w_im - w_low;
+                            float hh = 1 - lh;
+                            float hw = 1 - lw;
+
+                            v1_cond = (h_low >= 0 && w_low >= 0);
+                            v2_cond = (h_low >= 0 && w_high <= w - 1);
+                            v3_cond = (h_high <= h - 1 && w_low >= 0);
+                            v4_cond = (h_high <= h - 1 && w_high <= w - 1);
+                            if (v1_cond)
+                                v1_pos = h_low * w + w_low;
+                            if (v2_cond)
+                                v2_pos = h_low * w + w_high;
+                            if (v3_cond)
+                                v3_pos = h_high * w + w_low;
+                            if (v4_cond)
+                                v4_pos = h_high * w + w_high;
+
+                            w1 = hh * hw;
+                            w2 = hh * lw;
+                            w3 = lh * hw;
+                            w4 = lh * lw;
+                        }
+                        const float w1s[out_elempack] = {w1, w1, w1, w1};
+                        const float* w1_ptr = w1s;
+                        const float w2s[out_elempack] = {w2, w2, w2, w2};
+                        const float* w2_ptr = w2s;
+                        const float w3s[out_elempack] = {w3, w3, w3, w3};
+                        const float* w3_ptr = w3s;
+                        const float w4s[out_elempack] = {w4, w4, w4, w4};
+                        const float* w4_ptr = w4s;
+                        const float masks[out_elempack] = {mask_, mask_, mask_, mask_};
+                        const float* mask_ptr = masks;
+
+                        for (int ic = 0; ic < inch; ic++)
+                        {
+                            const float* data_im_ptr = bottom_blob.channel(ic);
+                            __m128 _val_channel0 = _mm_loadu_ps(zeros_ptr);
+                            if (cond)
+                            {
+                                __m128 _v1_channel0 = _val_channel0;
+                                __m128 _v2_channel0 = _val_channel0;
+                                __m128 _v3_channel0 = _val_channel0;
+                                __m128 _v4_channel0 = _val_channel0;
+                                if (v1_cond)
+                                {
+                                    _v1_channel0 = _mm_load1_ps(data_im_ptr + v1_pos);
+                                }
+                                if (v2_cond)
+                                {
+                                    _v2_channel0 = _mm_load1_ps(data_im_ptr + v2_pos);
+                                }
+                                if (v3_cond)
+                                {
+                                    _v3_channel0 = _mm_load1_ps(data_im_ptr + v3_pos);
+                                }
+                                if (v4_cond)
+                                {
+                                    _v4_channel0 = _mm_load1_ps(data_im_ptr + v4_pos);
+                                }
+                                __m128 _w1 = _mm_loadu_ps(w1_ptr);
+                                __m128 _w2 = _mm_loadu_ps(w2_ptr);
+                                __m128 _w3 = _mm_loadu_ps(w3_ptr);
+                                __m128 _w4 = _mm_loadu_ps(w4_ptr);
+                                _val_channel0 = _mm_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0);
+                                _val_channel0 = _mm_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0);
+                                _val_channel0 = _mm_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0);
+                                _val_channel0 = _mm_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0);
+                            }
+                            if (has_mask)
+                            {
+                                __m128 _mask = _mm_loadu_ps(mask_ptr);
+                                _val_channel0 = _mm_mul_ps(_val_channel0, _mask);
+                            }
+                            __m128 _conv_w0 = _mm_load_ps(kptr);
+                            _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum);
+                            kptr += wstep;
+                        }
+                    }
+                }
+                _sum = activation_sse(_sum, activation_type, activation_params);
+                _mm_storeu_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum);
+            }
+        }
+    }
+}
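All of these pack variants specialize the same scalar recipe, differing only in how many input and output channels travel per register. As a reading aid, a minimal unpacked sketch of one deformable kernel-tap sample under the same boundary rules (illustrative only, not part of the patch; assumes <math.h> for floorf):

    // scalar reference for one deformable tap; mirrors the intrinsics paths
    static float deformable_sample(const float* im, int w, int h,
                                   float h_im, float w_im, float mask)
    {
        // outside the (-1, h) x (-1, w) band every corner tap is zero
        if (!(h_im > -1 && w_im > -1 && h_im < h && w_im < w))
            return 0.f;

        int h_low = (int)floorf(h_im);
        int w_low = (int)floorf(w_im);
        int h_high = h_low + 1;
        int w_high = w_low + 1;

        float lh = h_im - h_low;
        float lw = w_im - w_low;
        float hh = 1.f - lh;
        float hw = 1.f - lw;

        // corners that fall off the image contribute zero (implicit zero padding)
        float v1 = (h_low >= 0 && w_low >= 0) ? im[h_low * w + w_low] : 0.f;
        float v2 = (h_low >= 0 && w_high <= w - 1) ? im[h_low * w + w_high] : 0.f;
        float v3 = (h_high <= h - 1 && w_low >= 0) ? im[h_high * w + w_low] : 0.f;
        float v4 = (h_high <= h - 1 && w_high <= w - 1) ? im[h_high * w + w_high] : 0.f;

        float val = hh * hw * v1 + hh * lw * v2 + lh * hw * v3 + lh * lw * v4;
        return val * mask; // modulation mask, 1.f when absent
    }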
diff --git a/src/layer/x86/deformableconv2d_pack1to8.h b/src/layer/x86/deformableconv2d_pack1to8.h
new file mode 100644
index 00000000000..fe1e0c8c0a6
--- /dev/null
+++ b/src/layer/x86/deformableconv2d_pack1to8.h
@@ -0,0 +1,195 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void deformableconv2d_pack1to8_avx(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt)
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& offset = bottom_blobs[1];
+    const bool has_mask = (bottom_blobs.size() == 3);
+    const bool offset_not_pack = offset.elempack == 1;
+    const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true;
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+    const int size = outw * outh;
+    const int maxk = kernel_w * kernel_h;
+
+    const float* bias_data_ptr = bias_data;
+    const int elempack = 1;
+    const int out_elempack = 8;
+    const int wstep = out_elempack * elempack;
+    const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
+    const float* zeros_ptr = zeros;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int h_col = 0; h_col < outh; h_col++)
+    {
+        for (int w_col = 0; w_col < outw; w_col++)
+        {
+            int h_in = h_col * stride_h - pad_top;
+            int w_in = w_col * stride_w - pad_left;
+            for (int oc = 0; oc < outch; oc++)
+            {
+                const float* kptr = weight_data_packed.channel(oc);
+                float* outptr = top_blob.channel(oc);
+                __m256 _sum = _mm256_setzero_ps();
+                if (bias_data_ptr)
+                    _sum = _mm256_loadu_ps(bias_data_ptr + oc * out_elempack);
+                for (int i = 0; i < kernel_h; i++)
+                {
+                    for (int j = 0; j < kernel_w; j++)
+                    {
+                        float offset_h = 0.f;
+                        float offset_w = 0.f;
+                        float mask_ = 1.f;
+                        if (offset_not_pack)
+                        {
+                            offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
+                            offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
+                        }
+                        else
+                        {
+                            const int y_c = (i * kernel_w + j) * 2;
+                            const int x_c = (i * kernel_w + j) * 2 + 1;
+                            offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack];
+                            offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack];
+                        }
+                        if (has_mask)
+                        {
+                            const Mat& mask = bottom_blobs[2];
+                            if (mask_not_pack)
+                            {
+                                mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
+                            }
+                            else
+                            {
+                                const int m_c = i * kernel_w + j;
+                                mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
+                            }
+                        }
+                        const float h_im = h_in + i * dilation_h + offset_h;
+                        const float w_im = w_in + j * dilation_w + offset_w;
+
+                        // Bilinear
+                        const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w;
+                        float w1 = 0.f;
+                        float w2 = 0.f;
+                        float w3 = 0.f;
+                        float w4 = 0.f;
+                        bool v1_cond = false;
+                        bool v2_cond = false;
+                        bool v3_cond = false;
+                        bool v4_cond = false;
+                        int v1_pos = 0;
+                        int v2_pos = 0;
+                        int v3_pos = 0;
+                        int v4_pos = 0;
+                        if (cond)
+                        {
+                            int h_low = floor(h_im);
+                            int w_low = floor(w_im);
+                            int h_high = h_low + 1;
+                            int w_high = w_low + 1;
+
+                            float lh = h_im - h_low;
+                            float lw = w_im - w_low;
+                            float hh = 1 - lh;
+                            float hw = 1 - lw;
+
+                            v1_cond = (h_low >= 0 && w_low >= 0);
+                            v2_cond = (h_low >= 0 && w_high <= w - 1);
+                            v3_cond = (h_high <= h - 1 && w_low >= 0);
+                            v4_cond = (h_high <= h - 1 && w_high <= w - 1);
+                            if (v1_cond)
+                                v1_pos = h_low * w + w_low;
+                            if (v2_cond)
+                                v2_pos = h_low * w + w_high;
+                            if (v3_cond)
+                                v3_pos = h_high * w + w_low;
+                            if (v4_cond)
+                                v4_pos = h_high * w + w_high;
+
+                            w1 = hh * hw;
+                            w2 = hh * lw;
+                            w3 = lh * hw;
+                            w4 = lh * lw;
+                        }
+                        const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1};
+                        const float* w1_ptr = w1s;
+                        const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2};
+                        const float* w2_ptr = w2s;
+                        const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3};
+                        const float* w3_ptr = w3s;
+                        const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4};
+                        const float* w4_ptr = w4s;
+                        const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_};
+                        const float* mask_ptr = masks;
+
+                        for (int ic = 0; ic < inch; ic++)
+                        {
+                            const float* data_im_ptr = bottom_blob.channel(ic);
+                            __m256 _val_channel0 = _mm256_loadu_ps(zeros_ptr);
+                            if (cond)
+                            {
+                                __m256 _v1_channel0 = _val_channel0;
+                                __m256 _v2_channel0 = _val_channel0;
+                                __m256 _v3_channel0 = _val_channel0;
+                                __m256 _v4_channel0 = _val_channel0;
+                                if (v1_cond)
+                                {
+                                    _v1_channel0 = _mm256_broadcast_ss(data_im_ptr + v1_pos);
+                                }
+                                if (v2_cond)
+                                {
+                                    _v2_channel0 = _mm256_broadcast_ss(data_im_ptr + v2_pos);
+                                }
+                                if (v3_cond)
+                                {
+                                    _v3_channel0 = _mm256_broadcast_ss(data_im_ptr + v3_pos);
+                                }
+                                if (v4_cond)
+                                {
+                                    _v4_channel0 = _mm256_broadcast_ss(data_im_ptr + v4_pos);
+                                }
+                                __m256 _w1 = _mm256_loadu_ps(w1_ptr);
+                                __m256 _w2 = _mm256_loadu_ps(w2_ptr);
+                                __m256 _w3 = _mm256_loadu_ps(w3_ptr);
+                                __m256 _w4 = _mm256_loadu_ps(w4_ptr);
+                                _val_channel0 = _mm256_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0);
+                                _val_channel0 = _mm256_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0);
+                                _val_channel0 = _mm256_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0);
+                                _val_channel0 = _mm256_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0);
+                            }
+                            if (has_mask)
+                            {
+                                __m256 _mask = _mm256_loadu_ps(mask_ptr);
+                                _val_channel0 = _mm256_mul_ps(_val_channel0, _mask);
+                            }
+                            __m256 _conv_w0 = _mm256_load_ps(kptr);
+                            _sum = _mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum);
+                            kptr += wstep;
+                        }
+                    }
+                }
+                _sum = activation_avx(_sum, activation_type, activation_params);
+                _mm256_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum);
+            }
+        }
+    }
+}
diff --git a/src/layer/x86/deformableconv2d_pack4.h b/src/layer/x86/deformableconv2d_pack4.h
new file mode 100644
index 00000000000..32b27963fb1
--- /dev/null
+++ b/src/layer/x86/deformableconv2d_pack4.h
@@ -0,0 +1,243 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void deformableconv2d_pack4_sse(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt)
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& offset = bottom_blobs[1];
+    const bool has_mask = (bottom_blobs.size() == 3);
+    const bool offset_not_pack = offset.elempack == 1;
+    const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true;
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+    const int size = outw * outh;
+    const int maxk = kernel_w * kernel_h;
+
+    const float* bias_data_ptr = bias_data;
+    const int elempack = 4;
+    const int out_elempack = 4;
+    const int wstep = out_elempack * elempack;
+    const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f};
+    const float* zeros_ptr = zeros;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int h_col = 0; h_col < outh; h_col++)
+    {
+        for (int w_col = 0; w_col < outw; w_col++)
+        {
+            int h_in = h_col * stride_h - pad_top;
+            int w_in = w_col * stride_w - pad_left;
+            for (int oc = 0; oc < outch; oc++)
+            {
+                const float* kptr = weight_data_packed.channel(oc);
+                float* outptr = top_blob.channel(oc);
+                __m128 _sum = _mm_setzero_ps();
+                if (bias_data_ptr)
+                    _sum = _mm_loadu_ps(bias_data_ptr + oc * out_elempack);
+                for (int i = 0; i < kernel_h; i++)
+                {
+                    for (int j = 0; j < kernel_w; j++)
+                    {
+                        float offset_h = 0.f;
+                        float offset_w = 0.f;
+                        float mask_ = 1.f;
+                        if (offset_not_pack)
+                        {
+                            offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
+                            offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
+                        }
+                        else
+                        {
+                            const int y_c = (i * kernel_w + j) * 2;
+                            const int x_c = (i * kernel_w + j) * 2 + 1;
+                            offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack];
+                            offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack];
+                        }
+                        if (has_mask)
+                        {
+                            const Mat& mask = bottom_blobs[2];
+                            if (mask_not_pack)
+                            {
+                                mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
+                            }
+                            else
+                            {
+                                const int m_c = i * kernel_w + j;
+                                mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
+                            }
+                        }
+                        const float h_im = h_in + i * dilation_h + offset_h;
+                        const float w_im = w_in + j * dilation_w + offset_w;
+
+                        // Bilinear
+                        const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w;
+                        float w1 = 0.f;
+                        float w2 = 0.f;
+                        float w3 = 0.f;
+                        float w4 = 0.f;
+                        bool v1_cond = false;
+                        bool v2_cond = false;
+                        bool v3_cond = false;
+                        bool v4_cond = false;
+                        int v1_pos = 0;
+                        int v2_pos = 0;
+                        int v3_pos = 0;
+                        int v4_pos = 0;
+                        if (cond)
+                        {
+                            int h_low = floor(h_im);
+                            int w_low = floor(w_im);
+                            int h_high = h_low + 1;
+                            int w_high = w_low + 1;
+
+                            float lh = h_im - h_low;
+                            float lw = w_im - w_low;
+                            float hh = 1 - lh;
+                            float hw = 1 - lw;
+
+                            v1_cond = (h_low >= 0 && w_low >= 0);
+                            v2_cond = (h_low >= 0 && w_high <= w - 1);
+                            v3_cond = (h_high <= h - 1 && w_low >= 0);
+                            v4_cond = (h_high <= h - 1 && w_high <= w - 1);
+                            if (v1_cond)
+                                v1_pos = h_low * w + w_low;
+                            if (v2_cond)
+                                v2_pos = h_low * w + w_high;
+                            if (v3_cond)
+                                v3_pos = h_high * w + w_low;
+                            if (v4_cond)
+                                v4_pos = h_high * w + w_high;
+
+                            w1 = hh * hw;
+                            w2 = hh * lw;
+                            w3 = lh * hw;
+                            w4 = lh * lw;
+                        }
+                        const float w1s[out_elempack] = {w1, w1, w1, w1};
+                        const float* w1_ptr = w1s;
+                        const float w2s[out_elempack] = {w2, w2, w2, w2};
+                        const float* w2_ptr = w2s;
+                        const float w3s[out_elempack] = {w3, w3, w3, w3};
+                        const float* w3_ptr = w3s;
+                        const float w4s[out_elempack] = {w4, w4, w4, w4};
+                        const float* w4_ptr = w4s;
+                        const float masks[out_elempack] = {mask_, mask_, mask_, mask_};
+                        const float* mask_ptr = masks;
+
+                        for (int ic = 0; ic < inch; ic++)
+                        {
+                            const float* data_im_ptr = bottom_blob.channel(ic);
+                            __m128 _val_channel0 = _mm_loadu_ps(zeros_ptr);
+                            __m128 _val_channel1 = _val_channel0;
+                            __m128 _val_channel2 = _val_channel0;
+                            __m128 _val_channel3 = _val_channel0;
+                            if (cond)
+                            {
+                                __m128 _v1_channel0 = _val_channel0;
+                                __m128 _v1_channel1 = _val_channel0;
+                                __m128 _v1_channel2 = _val_channel0;
+                                __m128 _v1_channel3 = _val_channel0;
+                                __m128 _v2_channel0 = _val_channel0;
+                                __m128 _v2_channel1 = _val_channel0;
+                                __m128 _v2_channel2 = _val_channel0;
+                                __m128 _v2_channel3 = _val_channel0;
+                                __m128 _v3_channel0 = _val_channel0;
+                                __m128 _v3_channel1 = _val_channel0;
+                                __m128 _v3_channel2 = _val_channel0;
+                                __m128 _v3_channel3 = _val_channel0;
+                                __m128 _v4_channel0 = _val_channel0;
+                                __m128 _v4_channel1 = _val_channel0;
+                                __m128 _v4_channel2 = _val_channel0;
+                                __m128 _v4_channel3 = _val_channel0;
+                                if (v1_cond)
+                                {
+                                    _v1_channel0 = _mm_load1_ps(data_im_ptr + v1_pos * elempack);
+                                    _v1_channel1 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 1);
+                                    _v1_channel2 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 2);
+                                    _v1_channel3 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 3);
+                                }
+                                if (v2_cond)
+                                {
+                                    _v2_channel0 = _mm_load1_ps(data_im_ptr + v2_pos * elempack);
+                                    _v2_channel1 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 1);
+                                    _v2_channel2 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 2);
+                                    _v2_channel3 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 3);
+                                }
+                                if (v3_cond)
+                                {
+                                    _v3_channel0 = _mm_load1_ps(data_im_ptr + v3_pos * elempack);
+                                    _v3_channel1 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 1);
+                                    _v3_channel2 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 2);
+                                    _v3_channel3 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 3);
+                                }
+                                if (v4_cond)
+                                {
+                                    _v4_channel0 = _mm_load1_ps(data_im_ptr + v4_pos * elempack);
+                                    _v4_channel1 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 1);
+                                    _v4_channel2 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 2);
+                                    _v4_channel3 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 3);
+                                }
+                                __m128 _w1 = _mm_loadu_ps(w1_ptr);
+                                __m128 _w2 = _mm_loadu_ps(w2_ptr);
+                                __m128 _w3 = _mm_loadu_ps(w3_ptr);
+                                __m128 _w4 = _mm_loadu_ps(w4_ptr);
+                                _val_channel0 = _mm_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0);
+                                _val_channel0 = _mm_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0);
+                                _val_channel0 = _mm_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0);
+                                _val_channel0 = _mm_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0);
+                                _val_channel1 = _mm_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1);
+                                _val_channel1 = _mm_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1);
+                                _val_channel1 = _mm_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1);
+                                _val_channel1 = _mm_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1);
+                                _val_channel2 = _mm_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2);
+                                _val_channel2 = _mm_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2);
+                                _val_channel2 = _mm_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2);
+                                _val_channel2 = _mm_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2);
+                                _val_channel3 = _mm_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3);
+                                _val_channel3 = _mm_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3);
+                                _val_channel3 = _mm_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3);
+                                _val_channel3 = _mm_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3);
+                            }
+                            if (has_mask)
+                            {
+                                __m128 _mask = _mm_loadu_ps(mask_ptr);
+                                _val_channel0 = _mm_mul_ps(_val_channel0, _mask);
+                                _val_channel1 = _mm_mul_ps(_val_channel1, _mask);
+                                _val_channel2 = _mm_mul_ps(_val_channel2, _mask);
+                                _val_channel3 = _mm_mul_ps(_val_channel3, _mask);
+                            }
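+                            // accumulate the 4-lane output: each of the 4 input channels of this
+                            // pack contributes one row of out_elempack (= 4) weights from kptr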
+                            __m128 _conv_w0 = _mm_load_ps(kptr);
+                            __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack); // 1 * out_elempack
+                            _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum);
+                            _sum = _mm_comp_fmadd_ps(_val_channel1, _conv_w1, _sum);
+                            __m128 _conv_w2 = _mm_load_ps(kptr + 8);  // 2 * out_elempack
+                            __m128 _conv_w3 = _mm_load_ps(kptr + 12); // 3 * out_elempack
+                            _sum = _mm_comp_fmadd_ps(_val_channel2, _conv_w2, _sum);
+                            _sum = _mm_comp_fmadd_ps(_val_channel3, _conv_w3, _sum);
+                            kptr += wstep;
+                        }
+                    }
+                }
+                _sum = activation_sse(_sum, activation_type, activation_params);
+                _mm_storeu_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum);
+            }
+        }
+    }
+}
diff --git a/src/layer/x86/deformableconv2d_pack4to1.h b/src/layer/x86/deformableconv2d_pack4to1.h
new file mode 100644
index 00000000000..7ee073a91cb
--- /dev/null
+++ b/src/layer/x86/deformableconv2d_pack4to1.h
@@ -0,0 +1,211 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void deformableconv2d_pack4to1_sse(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt)
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& offset = bottom_blobs[1];
+    const bool has_mask = (bottom_blobs.size() == 3);
+    const bool offset_not_pack = offset.elempack == 1;
+    const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true;
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+    const int size = outw * outh;
+    const int maxk = kernel_w * kernel_h;
+
+    const float* bias_data_ptr = bias_data;
+    const int elempack = 4;
+    const int out_elempack = 1;
+    const int wstep = out_elempack * elempack;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int h_col = 0; h_col < outh; h_col++)
+    {
+        for (int w_col = 0; w_col < outw; w_col++)
+        {
+            int h_in = h_col * stride_h - pad_top;
+            int w_in = w_col * stride_w - pad_left;
+            for (int oc = 0; oc < outch; oc++)
+            {
+                const float* kptr = weight_data_packed.channel(oc);
+                float* outptr = top_blob.channel(oc);
+                float _sum = 0.f;
+                if (bias_data_ptr)
+                    _sum = *(bias_data_ptr + oc);
+                for (int i = 0; i < kernel_h; i++)
+                {
+                    for (int j = 0; j < kernel_w; j++)
+                    {
+                        float offset_h = 0.f;
+                        float offset_w = 0.f;
+                        float mask_ = 1.f;
+                        if (offset_not_pack)
+                        {
+                            offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
+                            offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
+                        }
+                        else
+                        {
+                            const int y_c = (i * kernel_w + j) * 2;
+                            const int x_c = (i * kernel_w + j) * 2 + 1;
+                            offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack];
+                            offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack];
+                        }
+                        if (has_mask)
+                        {
+                            const Mat& mask = bottom_blobs[2];
+                            if (mask_not_pack)
+                            {
+                                mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
+                            }
+                            else
+                            {
+                                const int m_c = i * kernel_w + j;
+                                mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
+                            }
+                        }
+                        const float h_im = h_in + i * dilation_h + offset_h;
+                        const float w_im = w_in + j * dilation_w + offset_w;
+
+                        // Bilinear
+                        const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w;
+                        float w1 = 0.f;
+                        float w2 = 0.f;
+                        float w3 = 0.f;
+                        float w4 = 0.f;
+                        bool v1_cond = false;
+                        bool v2_cond = false;
+                        bool v3_cond = false;
+                        bool v4_cond = false;
+                        int v1_pos = 0;
+                        int v2_pos = 0;
+                        int v3_pos = 0;
+                        int v4_pos = 0;
+                        if (cond)
+                        {
+                            int h_low = floor(h_im);
+                            int w_low = floor(w_im);
+                            int h_high = h_low + 1;
+                            int w_high = w_low + 1;
+
+                            float lh = h_im - h_low;
+                            float lw = w_im - w_low;
+                            float hh = 1 - lh;
+                            float hw = 1 - lw;
+
+                            v1_cond = (h_low >= 0 && w_low >= 0);
+                            v2_cond = (h_low >= 0 && w_high <= w - 1);
+                            v3_cond = (h_high <= h - 1 && w_low >= 0);
+                            v4_cond = (h_high <= h - 1 && w_high <= w - 1);
+                            if (v1_cond)
+                                v1_pos = h_low * w + w_low;
+                            if (v2_cond)
+                                v2_pos = h_low * w + w_high;
+                            if (v3_cond)
+                                v3_pos = h_high * w + w_low;
+                            if (v4_cond)
+                                v4_pos = h_high * w + w_high;
+
+                            w1 = hh * hw;
+                            w2 = hh * lw;
+                            w3 = lh * hw;
+                            w4 = lh * lw;
+                        }
+
+                        for (int ic = 0; ic < inch; ic++)
+                        {
+                            const float* data_im_ptr = bottom_blob.channel(ic);
+                            float _val_channel0 = 0.f;
+                            float _val_channel1 = _val_channel0;
+                            float _val_channel2 = _val_channel0;
+                            float _val_channel3 = _val_channel0;
+                            if (cond)
+                            {
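+                                // scalar path: out_elempack is 1, so the four input channels of
+                                // this pack are interpolated and reduced in plain float math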
+                                float _v1_channel0 = _val_channel0;
+                                float _v1_channel1 = _val_channel0;
+                                float _v1_channel2 = _val_channel0;
+                                float _v1_channel3 = _val_channel0;
+                                float _v2_channel0 = _val_channel0;
+                                float _v2_channel1 = _val_channel0;
+                                float _v2_channel2 = _val_channel0;
+                                float _v2_channel3 = _val_channel0;
+                                float _v3_channel0 = _val_channel0;
+                                float _v3_channel1 = _val_channel0;
+                                float _v3_channel2 = _val_channel0;
+                                float _v3_channel3 = _val_channel0;
+                                float _v4_channel0 = _val_channel0;
+                                float _v4_channel1 = _val_channel0;
+                                float _v4_channel2 = _val_channel0;
+                                float _v4_channel3 = _val_channel0;
+                                if (v1_cond)
+                                {
+                                    _v1_channel0 = *(data_im_ptr + v1_pos * elempack);
+                                    _v1_channel1 = *(data_im_ptr + v1_pos * elempack + 1);
+                                    _v1_channel2 = *(data_im_ptr + v1_pos * elempack + 2);
+                                    _v1_channel3 = *(data_im_ptr + v1_pos * elempack + 3);
+                                }
+                                if (v2_cond)
+                                {
+                                    _v2_channel0 = *(data_im_ptr + v2_pos * elempack);
+                                    _v2_channel1 = *(data_im_ptr + v2_pos * elempack + 1);
+                                    _v2_channel2 = *(data_im_ptr + v2_pos * elempack + 2);
+                                    _v2_channel3 = *(data_im_ptr + v2_pos * elempack + 3);
+                                }
+                                if (v3_cond)
+                                {
+                                    _v3_channel0 = *(data_im_ptr + v3_pos * elempack);
+                                    _v3_channel1 = *(data_im_ptr + v3_pos * elempack + 1);
+                                    _v3_channel2 = *(data_im_ptr + v3_pos * elempack + 2);
+                                    _v3_channel3 = *(data_im_ptr + v3_pos * elempack + 3);
+                                }
+                                if (v4_cond)
+                                {
+                                    _v4_channel0 = *(data_im_ptr + v4_pos * elempack);
+                                    _v4_channel1 = *(data_im_ptr + v4_pos * elempack + 1);
+                                    _v4_channel2 = *(data_im_ptr + v4_pos * elempack + 2);
+                                    _v4_channel3 = *(data_im_ptr + v4_pos * elempack + 3);
+                                }
+                                _val_channel0 = w1 * _v1_channel0 + w2 * _v2_channel0 + w3 * _v3_channel0 + w4 * _v4_channel0;
+                                _val_channel1 = w1 * _v1_channel1 + w2 * _v2_channel1 + w3 * _v3_channel1 + w4 * _v4_channel1;
+                                _val_channel2 = w1 * _v1_channel2 + w2 * _v2_channel2 + w3 * _v3_channel2 + w4 * _v4_channel2;
+                                _val_channel3 = w1 * _v1_channel3 + w2 * _v2_channel3 + w3 * _v3_channel3 + w4 * _v4_channel3;
+                            }
+                            if (has_mask)
+                            {
+                                _val_channel0 *= mask_;
+                                _val_channel1 *= mask_;
+                                _val_channel2 *= mask_;
+                                _val_channel3 *= mask_;
+                            }
+                            float _conv_w0 = *(kptr);
+                            float _conv_w1 = *(kptr + out_elempack); // 1 * out_elempack
+                            float _conv_w2 = *(kptr + 2);            // 2 * out_elempack
+                            float _conv_w3 = *(kptr + 3);            // 3 * out_elempack
+                            _sum += (_val_channel0 * _conv_w0 + _val_channel1 * _conv_w1 + _val_channel2 * _conv_w2 + _val_channel3 * _conv_w3);
+                            kptr += wstep;
+                        }
+                    }
+                }
+                _sum = activation_ss(_sum, activation_type, activation_params);
+                *(outptr + h_col * outw + w_col) = _sum;
+            }
+        }
+    }
+}
diff --git a/src/layer/x86/deformableconv2d_pack4to16.h b/src/layer/x86/deformableconv2d_pack4to16.h
new file mode 100644
index 00000000000..809bb7cb2b5
--- /dev/null
+++ b/src/layer/x86/deformableconv2d_pack4to16.h
@@ -0,0 +1,243 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
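+
+// input elempack 4 -> output elempack 16: four input lanes per pixel are each
+// broadcast to a 16-wide vector and accumulated into one __m512 output block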
+ +static void deformableconv2d_pack4to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 4; + const int out_elempack = 16; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m512 _sum = _mm512_setzero_ps(); + if (bias_data_ptr) + _sum = _mm512_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; 
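// [Editor's note -- worked example, not part of the patch.] v1..v4 are the four
// integer taps surrounding the fractional sample point (h_im, w_im); each is
// guarded so that out-of-bounds taps contribute zero. The weights computed just
// below are the standard bilinear ones. For example, with h_im = 2.25 and
// w_im = 3.5: h_low = 2, w_low = 3, lh = 0.25, lw = 0.5, hh = 0.75, hw = 0.5,
// giving
//     w1 = hh * hw = 0.375   (top-left,     v1)
//     w2 = hh * lw = 0.375   (top-right,    v2)
//     w3 = lh * hw = 0.125   (bottom-left,  v3)
//     w4 = lh * lw = 0.125   (bottom-right, v4)
// so w1 + w2 + w3 + w4 == 1 whenever the sample point is in bounds.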
+ if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val_channel0 = _mm512_loadu_ps(zeros_ptr); + __m512 _val_channel1 = _val_channel0; + __m512 _val_channel2 = _val_channel0; + __m512 _val_channel3 = _val_channel0; + if (cond) + { + __m512 _v1_channel0 = _val_channel0; + __m512 _v1_channel1 = _val_channel0; + __m512 _v1_channel2 = _val_channel0; + __m512 _v1_channel3 = _val_channel0; + __m512 _v2_channel0 = _val_channel0; + __m512 _v2_channel1 = _val_channel0; + __m512 _v2_channel2 = _val_channel0; + __m512 _v2_channel3 = _val_channel0; + __m512 _v3_channel0 = _val_channel0; + __m512 _v3_channel1 = _val_channel0; + __m512 _v3_channel2 = _val_channel0; + __m512 _v3_channel3 = _val_channel0; + __m512 _v4_channel0 = _val_channel0; + __m512 _v4_channel1 = _val_channel0; + __m512 _v4_channel2 = _val_channel0; + __m512 _v4_channel3 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack]); + _v1_channel1 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 1]); + _v1_channel2 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 2]); + _v1_channel3 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 3]); + } + if (v2_cond) + { + _v2_channel0 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack]); + _v2_channel1 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 1]); + _v2_channel2 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 2]); + _v2_channel3 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 3]); + } + if (v3_cond) + { + _v3_channel0 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack]); + _v3_channel1 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 1]); + _v3_channel2 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 2]); + _v3_channel3 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 3]); + } + if (v4_cond) + { + _v4_channel0 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack]); + _v4_channel1 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 1]); + _v4_channel2 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 2]); + _v4_channel3 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 3]); + } + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val_channel0 = _mm512_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm512_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v3_channel1, _w3, _val_channel1); + 
_val_channel1 = _mm512_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm512_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm512_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v4_channel3, _w4, _val_channel3); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val_channel0 = _mm512_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm512_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm512_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm512_mul_ps(_val_channel3, _mask); + } + __m512 _conv_w0 = _mm512_load_ps(kptr); + __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm512_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m512 _conv_w2 = _mm512_load_ps(kptr + 32); // 2 * out_elempack + __m512 _conv_w3 = _mm512_load_ps(kptr + 48); // 3 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm512_fmadd_ps(_val_channel3, _conv_w3, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx512(_sum, activation_type, activation_params); + _mm512_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack4to8.h b/src/layer/x86/deformableconv2d_pack4to8.h new file mode 100644 index 00000000000..84099691826 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack4to8.h @@ -0,0 +1,243 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack4to8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 4; + const int out_elempack = 8; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m256 _sum = _mm256_setzero_ps(); + if (bias_data_ptr) + _sum = _mm256_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const 
float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val_channel0 = _mm256_loadu_ps(zeros_ptr); + __m256 _val_channel1 = _val_channel0; + __m256 _val_channel2 = _val_channel0; + __m256 _val_channel3 = _val_channel0; + if (cond) + { + __m256 _v1_channel0 = _val_channel0; + __m256 _v1_channel1 = _val_channel0; + __m256 _v1_channel2 = _val_channel0; + __m256 _v1_channel3 = _val_channel0; + __m256 _v2_channel0 = _val_channel0; + __m256 _v2_channel1 = _val_channel0; + __m256 _v2_channel2 = _val_channel0; + __m256 _v2_channel3 = _val_channel0; + __m256 _v3_channel0 = _val_channel0; + __m256 _v3_channel1 = _val_channel0; + __m256 _v3_channel2 = _val_channel0; + __m256 _v3_channel3 = _val_channel0; + __m256 _v4_channel0 = _val_channel0; + __m256 _v4_channel1 = _val_channel0; + __m256 _v4_channel2 = _val_channel0; + __m256 _v4_channel3 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 3); + } + if (v2_cond) + { + _v2_channel0 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 3); + } + if (v3_cond) + { + _v3_channel0 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 3); + } + if (v4_cond) + { + _v4_channel0 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 3); + } + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val_channel0 = _mm256_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm256_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm256_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm256_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); 
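// [Editor's note -- illustrative sketch, not part of the patch.] The fmadd
// chain above is the vector form of interpolating each of the elempack(=4)
// input lanes independently, with the result replicated across all
// out_elempack(=8) output lanes. A scalar equivalent, reusing the same names:
//
//     for (int k = 0; k < 4; k++) // input lanes in this pack
//         val[k] = w1 * v1[k] + w2 * v2[k] + w3 * v3[k] + w4 * v4[k];
//
// Broadcasting each tap before the multiply keeps the whole inner product in
// SIMD registers at the cost of redundant lanes.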
+ } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val_channel0 = _mm256_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm256_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm256_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm256_mul_ps(_val_channel3, _mask); + } + __m256 _conv_w0 = _mm256_load_ps(kptr); + __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m256 _conv_w2 = _mm256_load_ps(kptr + 16); // 2 * out_elempack + __m256 _conv_w3 = _mm256_load_ps(kptr + 24); // 3 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx(_sum, activation_type, activation_params); + _mm256_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack8.h b/src/layer/x86/deformableconv2d_pack8.h new file mode 100644 index 00000000000..277817e3948 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack8.h @@ -0,0 +1,307 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 8; + const int out_elempack = 8; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m256 _sum = _mm256_setzero_ps(); + if (bias_data_ptr) + _sum = _mm256_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const 
float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val_channel0 = _mm256_loadu_ps(zeros_ptr); + __m256 _val_channel1 = _val_channel0; + __m256 _val_channel2 = _val_channel0; + __m256 _val_channel3 = _val_channel0; + __m256 _val_channel4 = _val_channel0; + __m256 _val_channel5 = _val_channel0; + __m256 _val_channel6 = _val_channel0; + __m256 _val_channel7 = _val_channel0; + if (cond) + { + __m256 _v1_channel0 = _val_channel0; + __m256 _v1_channel1 = _val_channel0; + __m256 _v1_channel2 = _val_channel0; + __m256 _v1_channel3 = _val_channel0; + __m256 _v1_channel4 = _val_channel0; + __m256 _v1_channel5 = _val_channel0; + __m256 _v1_channel6 = _val_channel0; + __m256 _v1_channel7 = _val_channel0; + __m256 _v2_channel0 = _val_channel0; + __m256 _v2_channel1 = _val_channel0; + __m256 _v2_channel2 = _val_channel0; + __m256 _v2_channel3 = _val_channel0; + __m256 _v2_channel4 = _val_channel0; + __m256 _v2_channel5 = _val_channel0; + __m256 _v2_channel6 = _val_channel0; + __m256 _v2_channel7 = _val_channel0; + __m256 _v3_channel0 = _val_channel0; + __m256 _v3_channel1 = _val_channel0; + __m256 _v3_channel2 = _val_channel0; + __m256 _v3_channel3 = _val_channel0; + __m256 _v3_channel4 = _val_channel0; + __m256 _v3_channel5 = _val_channel0; + __m256 _v3_channel6 = _val_channel0; + __m256 _v3_channel7 = _val_channel0; + __m256 _v4_channel0 = _val_channel0; + __m256 _v4_channel1 = _val_channel0; + __m256 _v4_channel2 = _val_channel0; + __m256 _v4_channel3 = _val_channel0; + __m256 _v4_channel4 = _val_channel0; + __m256 _v4_channel5 = _val_channel0; + __m256 _v4_channel6 = _val_channel0; + __m256 _v4_channel7 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 7); + } + if (v2_cond) + { + _v2_channel0 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 7); + } + if (v3_cond) + { + _v3_channel0 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = _mm256_broadcast_ss(data_im_ptr + v3_pos 
* elempack + 7); + } + if (v4_cond) + { + _v4_channel0 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 7); + } + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val_channel0 = _mm256_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm256_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm256_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm256_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm256_comp_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm256_comp_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm256_comp_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm256_comp_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v4_channel7, _w4, _val_channel7); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val_channel0 = _mm256_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm256_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm256_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm256_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm256_mul_ps(_val_channel4, _mask); + _val_channel5 = 
_mm256_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm256_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm256_mul_ps(_val_channel7, _mask); + } + __m256 _conv_w0 = _mm256_load_ps(kptr); + __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m256 _conv_w2 = _mm256_load_ps(kptr + 16); // 2 * out_elempack + __m256 _conv_w3 = _mm256_load_ps(kptr + 24); // 3 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m256 _conv_w4 = _mm256_load_ps(kptr + 32); // 4 * out_elempack + __m256 _conv_w5 = _mm256_load_ps(kptr + 40); // 5 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m256 _conv_w6 = _mm256_load_ps(kptr + 48); // 6 * out_elempack + __m256 _conv_w7 = _mm256_load_ps(kptr + 56); // 7 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel7, _conv_w7, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx(_sum, activation_type, activation_params); + _mm256_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack8to1.h b/src/layer/x86/deformableconv2d_pack8to1.h new file mode 100644 index 00000000000..c4b97b40f06 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack8to1.h @@ -0,0 +1,259 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack8to1_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 8; + const int out_elempack = 1; + const int wstep = out_elempack * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + float _sum = 0.f; + if (bias_data_ptr) + _sum = *(bias_data_ptr + oc); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + float _val_channel0 = 0.f; + float _val_channel1 = _val_channel0; + float _val_channel2 = _val_channel0; + float _val_channel3 = _val_channel0; + float _val_channel4 = _val_channel0; + float _val_channel5 = _val_channel0; + float _val_channel6 = _val_channel0; + float _val_channel7 = _val_channel0; + if (cond) + { + float _v1_channel0 = _val_channel0; + float _v1_channel1 = _val_channel0; + float _v1_channel2 = _val_channel0; + float _v1_channel3 = _val_channel0; + float _v1_channel4 = _val_channel0; + float _v1_channel5 = _val_channel0; + 
float _v1_channel6 = _val_channel0; + float _v1_channel7 = _val_channel0; + float _v2_channel0 = _val_channel0; + float _v2_channel1 = _val_channel0; + float _v2_channel2 = _val_channel0; + float _v2_channel3 = _val_channel0; + float _v2_channel4 = _val_channel0; + float _v2_channel5 = _val_channel0; + float _v2_channel6 = _val_channel0; + float _v2_channel7 = _val_channel0; + float _v3_channel0 = _val_channel0; + float _v3_channel1 = _val_channel0; + float _v3_channel2 = _val_channel0; + float _v3_channel3 = _val_channel0; + float _v3_channel4 = _val_channel0; + float _v3_channel5 = _val_channel0; + float _v3_channel6 = _val_channel0; + float _v3_channel7 = _val_channel0; + float _v4_channel0 = _val_channel0; + float _v4_channel1 = _val_channel0; + float _v4_channel2 = _val_channel0; + float _v4_channel3 = _val_channel0; + float _v4_channel4 = _val_channel0; + float _v4_channel5 = _val_channel0; + float _v4_channel6 = _val_channel0; + float _v4_channel7 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = *(data_im_ptr + v1_pos * elempack); + _v1_channel1 = *(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = *(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = *(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = *(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = *(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = *(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = *(data_im_ptr + v1_pos * elempack + 7); + } + if (v2_cond) + { + _v2_channel0 = *(data_im_ptr + v2_pos * elempack); + _v2_channel1 = *(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = *(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = *(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = *(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = *(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = *(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = *(data_im_ptr + v2_pos * elempack + 7); + } + if (v3_cond) + { + _v3_channel0 = *(data_im_ptr + v3_pos * elempack); + _v3_channel1 = *(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = *(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = *(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = *(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = *(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = *(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = *(data_im_ptr + v3_pos * elempack + 7); + } + if (v4_cond) + { + _v4_channel0 = *(data_im_ptr + v4_pos * elempack); + _v4_channel1 = *(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = *(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = *(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = *(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = *(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = *(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = *(data_im_ptr + v4_pos * elempack + 7); + } + _val_channel0 = w1 * _v1_channel0 + w2 * _v2_channel0 + w3 * _v3_channel0 + w4 * _v4_channel0; + _val_channel1 = w1 * _v1_channel1 + w2 * _v2_channel1 + w3 * _v3_channel1 + w4 * _v4_channel1; + _val_channel2 = w1 * _v1_channel2 + w2 * _v2_channel2 + w3 * _v3_channel2 + w4 * _v4_channel2; + _val_channel3 = w1 * _v1_channel3 + w2 * _v2_channel3 + w3 * _v3_channel3 + w4 * _v4_channel3; + _val_channel4 = w1 * _v1_channel4 + w2 * _v2_channel4 + w3 * _v3_channel4 + w4 * _v4_channel4; + _val_channel5 = w1 * _v1_channel5 + w2 * _v2_channel5 + w3 * _v3_channel5 + w4 * _v4_channel5; + _val_channel6 = w1 * _v1_channel6 + w2 * _v2_channel6 + w3 * _v3_channel6 + w4 * 
_v4_channel6; + _val_channel7 = w1 * _v1_channel7 + w2 * _v2_channel7 + w3 * _v3_channel7 + w4 * _v4_channel7; + } + if (has_mask) + { + _val_channel0 *= mask_; + _val_channel1 *= mask_; + _val_channel2 *= mask_; + _val_channel3 *= mask_; + _val_channel4 *= mask_; + _val_channel5 *= mask_; + _val_channel6 *= mask_; + _val_channel7 *= mask_; + } + float _conv_w0 = *(kptr); + float _conv_w1 = *(kptr + out_elempack); // 1 * out_elempack + float _conv_w2 = *(kptr + 2); // 2 * out_elempack + float _conv_w3 = *(kptr + 3); // 3 * out_elempack + float _conv_w4 = *(kptr + 4); // 4 * out_elempack + float _conv_w5 = *(kptr + 5); // 5 * out_elempack + float _conv_w6 = *(kptr + 6); // 6 * out_elempack + float _conv_w7 = *(kptr + 7); // 7 * out_elempack + _sum += (_val_channel0 * _conv_w0 + _val_channel1 * _conv_w1 + _val_channel2 * _conv_w2 + _val_channel3 * _conv_w3 + _val_channel4 * _conv_w4 + _val_channel5 * _conv_w5 + _val_channel6 * _conv_w6 + _val_channel7 * _conv_w7); + kptr += wstep; + } + } + } + _sum = activation_ss(_sum, activation_type, activation_params); + *(outptr + h_col * outw + w_col) = _sum; + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack8to16.h b/src/layer/x86/deformableconv2d_pack8to16.h new file mode 100644 index 00000000000..15e5ed076e6 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack8to16.h @@ -0,0 +1,307 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack8to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 8; + const int out_elempack = 16; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m512 _sum = _mm512_setzero_ps(); + if (bias_data_ptr) + _sum = _mm512_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, 
w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val_channel0 = _mm512_loadu_ps(zeros_ptr); + __m512 _val_channel1 = _val_channel0; + __m512 _val_channel2 = _val_channel0; + __m512 _val_channel3 = _val_channel0; + __m512 _val_channel4 = _val_channel0; + __m512 _val_channel5 = _val_channel0; + __m512 _val_channel6 = _val_channel0; + __m512 _val_channel7 = _val_channel0; + if (cond) + { + __m512 _v1_channel0 = _val_channel0; + __m512 _v1_channel1 = _val_channel0; + __m512 _v1_channel2 = _val_channel0; + __m512 _v1_channel3 = _val_channel0; + __m512 _v1_channel4 = _val_channel0; + __m512 _v1_channel5 = _val_channel0; + __m512 _v1_channel6 = _val_channel0; + __m512 _v1_channel7 = _val_channel0; + __m512 _v2_channel0 = _val_channel0; + __m512 _v2_channel1 = _val_channel0; + __m512 _v2_channel2 = _val_channel0; + __m512 _v2_channel3 = _val_channel0; + __m512 _v2_channel4 = _val_channel0; + __m512 _v2_channel5 = _val_channel0; + __m512 _v2_channel6 = _val_channel0; + __m512 _v2_channel7 = _val_channel0; + __m512 _v3_channel0 = _val_channel0; + __m512 _v3_channel1 = _val_channel0; + __m512 _v3_channel2 = _val_channel0; + __m512 _v3_channel3 = _val_channel0; + __m512 _v3_channel4 = _val_channel0; + __m512 _v3_channel5 = _val_channel0; + __m512 _v3_channel6 = _val_channel0; + __m512 _v3_channel7 = _val_channel0; + __m512 _v4_channel0 = _val_channel0; + __m512 _v4_channel1 = _val_channel0; + __m512 _v4_channel2 = _val_channel0; + __m512 _v4_channel3 = _val_channel0; + __m512 _v4_channel4 = _val_channel0; + __m512 _v4_channel5 = _val_channel0; + __m512 _v4_channel6 = _val_channel0; + __m512 _v4_channel7 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack]); + _v1_channel1 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 1]); + _v1_channel2 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 2]); + _v1_channel3 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 3]); + _v1_channel4 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 4]); + _v1_channel5 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 5]); + _v1_channel6 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 6]); + _v1_channel7 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 7]); + } + if (v2_cond) + { + _v2_channel0 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack]); + _v2_channel1 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 1]); + _v2_channel2 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 2]); + _v2_channel3 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 3]); + _v2_channel4 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 4]); + _v2_channel5 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 5]); + _v2_channel6 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 6]); + _v2_channel7 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 7]); + } + if (v3_cond) + { + _v3_channel0 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack]); + _v3_channel1 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 1]); + _v3_channel2 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 2]); + _v3_channel3 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 3]); + _v3_channel4 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 4]); + _v3_channel5 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 5]); + _v3_channel6 = 
_mm512_set1_ps(data_im_ptr[v3_pos * elempack + 6]); + _v3_channel7 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 7]); + } + if (v4_cond) + { + _v4_channel0 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack]); + _v4_channel1 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 1]); + _v4_channel2 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 2]); + _v4_channel3 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 3]); + _v4_channel4 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 4]); + _v4_channel5 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 5]); + _v4_channel6 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 6]); + _v4_channel7 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 7]); + } + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val_channel0 = _mm512_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm512_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm512_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm512_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm512_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm512_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm512_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm512_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v4_channel7, _w4, _val_channel7); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val_channel0 = _mm512_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm512_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm512_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm512_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm512_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm512_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm512_mul_ps(_val_channel6, _mask); + _val_channel7 = 
_mm512_mul_ps(_val_channel7, _mask); + } + __m512 _conv_w0 = _mm512_load_ps(kptr); + __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm512_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m512 _conv_w2 = _mm512_load_ps(kptr + 32); // 2 * out_elempack + __m512 _conv_w3 = _mm512_load_ps(kptr + 48); // 3 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm512_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m512 _conv_w4 = _mm512_load_ps(kptr + 64); // 4 * out_elempack + __m512 _conv_w5 = _mm512_load_ps(kptr + 80); // 5 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm512_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m512 _conv_w6 = _mm512_load_ps(kptr + 96); // 6 * out_elempack + __m512 _conv_w7 = _mm512_load_ps(kptr + 112); // 7 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm512_fmadd_ps(_val_channel7, _conv_w7, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx512(_sum, activation_type, activation_params); + _mm512_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack8to4.h b/src/layer/x86/deformableconv2d_pack8to4.h new file mode 100644 index 00000000000..85aa06aaa03 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack8to4.h @@ -0,0 +1,307 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack8to4_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 8; + const int out_elempack = 4; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m128 _sum = _mm_setzero_ps(); + if (bias_data_ptr) + _sum = _mm_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = 
bottom_blob.channel(ic); + __m128 _val_channel0 = _mm_loadu_ps(zeros_ptr); + __m128 _val_channel1 = _val_channel0; + __m128 _val_channel2 = _val_channel0; + __m128 _val_channel3 = _val_channel0; + __m128 _val_channel4 = _val_channel0; + __m128 _val_channel5 = _val_channel0; + __m128 _val_channel6 = _val_channel0; + __m128 _val_channel7 = _val_channel0; + if (cond) + { + __m128 _v1_channel0 = _val_channel0; + __m128 _v1_channel1 = _val_channel0; + __m128 _v1_channel2 = _val_channel0; + __m128 _v1_channel3 = _val_channel0; + __m128 _v1_channel4 = _val_channel0; + __m128 _v1_channel5 = _val_channel0; + __m128 _v1_channel6 = _val_channel0; + __m128 _v1_channel7 = _val_channel0; + __m128 _v2_channel0 = _val_channel0; + __m128 _v2_channel1 = _val_channel0; + __m128 _v2_channel2 = _val_channel0; + __m128 _v2_channel3 = _val_channel0; + __m128 _v2_channel4 = _val_channel0; + __m128 _v2_channel5 = _val_channel0; + __m128 _v2_channel6 = _val_channel0; + __m128 _v2_channel7 = _val_channel0; + __m128 _v3_channel0 = _val_channel0; + __m128 _v3_channel1 = _val_channel0; + __m128 _v3_channel2 = _val_channel0; + __m128 _v3_channel3 = _val_channel0; + __m128 _v3_channel4 = _val_channel0; + __m128 _v3_channel5 = _val_channel0; + __m128 _v3_channel6 = _val_channel0; + __m128 _v3_channel7 = _val_channel0; + __m128 _v4_channel0 = _val_channel0; + __m128 _v4_channel1 = _val_channel0; + __m128 _v4_channel2 = _val_channel0; + __m128 _v4_channel3 = _val_channel0; + __m128 _v4_channel4 = _val_channel0; + __m128 _v4_channel5 = _val_channel0; + __m128 _v4_channel6 = _val_channel0; + __m128 _v4_channel7 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm_load1_ps(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 7); + } + if (v2_cond) + { + _v2_channel0 = _mm_load1_ps(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 7); + } + if (v3_cond) + { + _v3_channel0 = _mm_load1_ps(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 7); + } + if (v4_cond) + { + _v4_channel0 = _mm_load1_ps(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = 
_mm_load1_ps(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 7); + } + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val_channel0 = _mm_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm_comp_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm_comp_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm_comp_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm_comp_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v4_channel7, _w4, _val_channel7); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val_channel0 = _mm_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm_mul_ps(_val_channel7, _mask); + } + __m128 _conv_w0 = _mm_load_ps(kptr); + __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m128 _conv_w2 = _mm_load_ps(kptr + 8); // 2 * out_elempack + __m128 _conv_w3 = 
_mm_load_ps(kptr + 12); // 3 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m128 _conv_w4 = _mm_load_ps(kptr + 16); // 4 * out_elempack + __m128 _conv_w5 = _mm_load_ps(kptr + 20); // 5 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m128 _conv_w6 = _mm_load_ps(kptr + 24); // 6 * out_elempack + __m128 _conv_w7 = _mm_load_ps(kptr + 28); // 7 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel7, _conv_w7, _sum); + kptr += wstep; + } + } + } + _sum = activation_sse(_sum, activation_type, activation_params); + _mm_storeu_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_sgemm.h b/src/layer/x86/deformableconv2d_sgemm.h new file mode 100644 index 00000000000..648af448b12 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm.h @@ -0,0 +1,136 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm.h" + +static void deformableconv2d_im2col_sgemm_sse(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ?
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + for (int ic = 0; ic < inch; ic++) + { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + bottom_im2col.channel(ic).row(i * kernel_w + j)[h_col * outw + w_col] = val * mask_; + } + } + } + } + } + } + + im2col_sgemm_sse(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16.h b/src/layer/x86/deformableconv2d_sgemm_pack16.h new file mode 100644 index 00000000000..37aab40f1e4 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack16.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack16.h" + +static void deformableconv2d_im2col_sgemm_pack16_avx512(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 16; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high
<= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1 = _val; + __m512 _v2 = _val; + __m512 _v3 = _val; + __m512 _v4 = _val; + if (v1_cond) + _v1 = _mm512_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm512_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm512_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm512_load_ps(data_im_ptr + v4_pos * elempack); + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val = _mm512_fmadd_ps(_v1, _w1, _val); + _val = _mm512_fmadd_ps(_v2, _w2, _val); + _val = _mm512_fmadd_ps(_v3, _w3, _val); + _val = _mm512_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val = _mm512_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm512_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack16_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16to1.h b/src/layer/x86/deformableconv2d_sgemm_pack16to1.h new file mode 100644 index 00000000000..686333e6ee4 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack16to1.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
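+ +// All the deformable im2col routines in these files share one sampling scheme: +// each kernel tap (i, j) reads the fractional source position +// h_im = h_in + i * dilation_h + offset_h, w_im = w_in + j * dilation_w + offset_w +// and bilinearly blends the four neighbouring pixels with weights +// w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw, where lh/lw are the +// fractional parts of h_im/w_im and hh = 1 - lh, hw = 1 - lw; corners outside +// the image contribute zero, and the optional mask in bottom_blobs[2] scales +// the interpolated value.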
+ +#include "convolution_sgemm_pack16to1.h" + +static void deformableconv2d_im2col_sgemm_pack16to1_avx512(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 16; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2,
w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1 = _val; + __m512 _v2 = _val; + __m512 _v3 = _val; + __m512 _v4 = _val; + if (v1_cond) + _v1 = _mm512_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm512_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm512_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm512_load_ps(data_im_ptr + v4_pos * elempack); + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val = _mm512_fmadd_ps(_v1, _w1, _val); + _val = _mm512_fmadd_ps(_v2, _w2, _val); + _val = _mm512_fmadd_ps(_v3, _w3, _val); + _val = _mm512_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val = _mm512_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm512_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack16to1_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16to4.h b/src/layer/x86/deformableconv2d_sgemm_pack16to4.h new file mode 100644 index 00000000000..a7438d1f983 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack16to4.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack16to4.h" + +static void deformableconv2d_im2col_sgemm_pack16to4_avx512(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ?
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 16; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* 
data_im_ptr = bottom_blob.channel(ic); + __m512 _val = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1 = _val; + __m512 _v2 = _val; + __m512 _v3 = _val; + __m512 _v4 = _val; + if (v1_cond) + _v1 = _mm512_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm512_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm512_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm512_load_ps(data_im_ptr + v4_pos * elempack); + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val = _mm512_fmadd_ps(_v1, _w1, _val); + _val = _mm512_fmadd_ps(_v2, _w2, _val); + _val = _mm512_fmadd_ps(_v3, _w3, _val); + _val = _mm512_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val = _mm512_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm512_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack16to4_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16to8.h b/src/layer/x86/deformableconv2d_sgemm_pack16to8.h new file mode 100644 index 00000000000..d441d254940 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack16to8.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack16to8.h" + +static void deformableconv2d_im2col_sgemm_pack16to8_avx512(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ?
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 16; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* 
data_im_ptr = bottom_blob.channel(ic); + __m512 _val = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1 = _val; + __m512 _v2 = _val; + __m512 _v3 = _val; + __m512 _v4 = _val; + if (v1_cond) + _v1 = _mm512_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm512_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm512_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm512_load_ps(data_im_ptr + v4_pos * elempack); + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val = _mm512_fmadd_ps(_v1, _w1, _val); + _val = _mm512_fmadd_ps(_v2, _w2, _val); + _val = _mm512_fmadd_ps(_v3, _w3, _val); + _val = _mm512_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val = _mm512_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm512_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack16to8_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack1to16.h b/src/layer/x86/deformableconv2d_sgemm_pack1to16.h new file mode 100644 index 00000000000..d30c11926fd --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack1to16.h @@ -0,0 +1,136 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack1to16.h" + +static void deformableconv2d_im2col_sgemm_pack1to16_avx512(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ?
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + for (int ic = 0; ic < inch; ic++) + { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + bottom_im2col.channel(ic).row(i * kernel_w + j)[h_col * outw + w_col] = val * mask_; + } + } + } + } + } + } + + im2col_sgemm_pack1to16_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack1to4.h b/src/layer/x86/deformableconv2d_sgemm_pack1to4.h new file mode 100644 index 00000000000..0070999c05c --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack1to4.h @@ -0,0 +1,136 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack1to4.h" + +static void deformableconv2d_im2col_sgemm_pack1to4_sse(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + for (int ic = 0; ic < inch; ic++)
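+// per input channel: blend the four corner pixels with w1..w4, then store val * mask_ into the im2col buffer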
+ { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + bottom_im2col.channel(ic).row(i * kernel_w + j)[h_col * outw + w_col] = val * mask_; + } + } + } + } + } + } + + im2col_sgemm_pack1to4_sse(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack1to8.h b/src/layer/x86/deformableconv2d_sgemm_pack1to8.h new file mode 100644 index 00000000000..d02c4245d7c --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack1to8.h @@ -0,0 +1,136 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack1to8.h" + +static void deformableconv2d_im2col_sgemm_pack1to8_avx(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ?
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + for (int ic = 0; ic < inch; ic++) + { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + bottom_im2col.channel(ic).row(i * kernel_w + j)[h_col * outw + w_col] = val * mask_; + } + } + } + } + } + } + + im2col_sgemm_pack1to8_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4.h b/src/layer/x86/deformableconv2d_sgemm_pack4.h new file mode 100644 index 00000000000..140fa78e522 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack4.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack4.h" + +static void deformableconv2d_im2col_sgemm_pack4_sse(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 4; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond =
(h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1 = _val; + __m128 _v2 = _val; + __m128 _v3 = _val; + __m128 _v4 = _val; + if (v1_cond) + _v1 = _mm_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm_load_ps(data_im_ptr + v4_pos * elempack); + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val = _mm_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val = _mm_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack4_sse(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4to1.h b/src/layer/x86/deformableconv2d_sgemm_pack4to1.h new file mode 100644 index 00000000000..d5d7b57cab5 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack4to1.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack4to1.h" + +static void deformableconv2d_im2col_sgemm_pack4to1_sse(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ?
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 4; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1 = _val; + __m128 _v2 = _val; + __m128 _v3 = _val; + __m128 _v4 = _val; + if (v1_cond) + _v1 = _mm_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = 
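/*
   When the offset (or mask) blob arrives packed, scalar channel c of the
   conceptual [2 * maxk, outh, outw] tensor lives in packed channel
   c / elempack at lane c % elempack, which is what the division/modulo
   indexing above computes. A hedged sketch of the same lookup:

   static inline float packed_scalar(const ncnn::Mat& m, int c, int y, int x)
   {
       // m.row(y) spans m.w * m.elempack floats; the lane index selects the channel
       return m.channel(c / m.elempack).row(y)[x * m.elempack + c % m.elempack];
   }
*/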
_mm_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm_load_ps(data_im_ptr + v4_pos * elempack); + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val = _mm_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val = _mm_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack4to1_sse(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4to16.h b/src/layer/x86/deformableconv2d_sgemm_pack4to16.h new file mode 100644 index 00000000000..7eef68bb01a --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack4to16.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack4to16.h" + +static void deformableconv2d_im2col_sgemm_pack4to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
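/*
   _mm_comp_fmadd_ps and _mm256_comp_fmadd_ps come from x86_usability.h; they
   compile to a true fused multiply-add when FMA is available and fall back to
   mul+add otherwise. Roughly (a sketch of the idea, not the verbatim ncnn
   definition):

   static inline __m128 comp_fmadd_ps(__m128 a, __m128 b, __m128 c)
   {
   #if __FMA__
       return _mm_fmadd_ps(a, b, c);           // single fused instruction
   #else
       return _mm_add_ps(_mm_mul_ps(a, b), c); // portable fallback
   #endif
   }
*/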
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 4; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1 = _val; + __m128 _v2 = _val; + __m128 _v3 = _val; + __m128 _v4 = _val; + if (v1_cond) + _v1 = _mm_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = 
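/*
   Loading from the stack zeros array is equivalent to _mm_setzero_ps(), and the
   same holds for the wider variants. Illustrative equivalence:

   __m128 z1 = _mm_loadu_ps(zeros_ptr); // what the patch does
   __m128 z2 = _mm_setzero_ps();        // single-intrinsic form
*/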
_mm_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm_load_ps(data_im_ptr + v4_pos * elempack); + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val = _mm_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val = _mm_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack4to16_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4to8.h b/src/layer/x86/deformableconv2d_sgemm_pack4to8.h new file mode 100644 index 00000000000..1096d5dc834 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack4to8.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack4to8.h" + +static void deformableconv2d_im2col_sgemm_pack4to8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
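/*
   v1_pos..v4_pos are scalar pixel indices; with packed storage, pixel (y, x) of
   a channel starts at (y * w + x) * elempack and holds elempack consecutive
   floats, one per lane, which is why every load scales the position by
   elempack. Sketch (packed_pixel is an illustrative name):

   static inline const float* packed_pixel(const float* channel_base,
                                           int y, int x, int w, int elempack)
   {
       return channel_base + (y * w + x) * elempack;
   }
*/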
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 4; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1 = _val; + __m128 _v2 = _val; + __m128 _v3 = _val; + __m128 _v4 = _val; + if (v1_cond) + _v1 = _mm_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = 
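/*
   The stack arrays w1s..w4s and masks splat one scalar across all lanes;
   _mm_set1_ps (or _mm256_set1_ps / _mm512_set1_ps for the wider kernels)
   expresses the same broadcast directly. Illustrative equivalence:

   __m128 a = _mm_loadu_ps(w1_ptr); // what the patch does
   __m128 b = _mm_set1_ps(w1);      // equivalent broadcast
*/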
_mm_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm_load_ps(data_im_ptr + v4_pos * elempack); + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val = _mm_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val = _mm_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack4to8_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8.h b/src/layer/x86/deformableconv2d_sgemm_pack8.h new file mode 100644 index 00000000000..fce55606859 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack8.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack8.h" + +static void deformableconv2d_im2col_sgemm_pack8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 8; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1 = _val; + __m256 _v2 = _val; + __m256 _v3 = _val; + __m256 _v4 = _val; + if (v1_cond) + _v1 = _mm256_load_ps(data_im_ptr 
+ v1_pos * elempack); + if (v2_cond) + _v2 = _mm256_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm256_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm256_load_ps(data_im_ptr + v4_pos * elempack); + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val = _mm256_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm256_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm256_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm256_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val = _mm256_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm256_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack8_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8to1.h b/src/layer/x86/deformableconv2d_sgemm_pack8to1.h new file mode 100644 index 00000000000..635c08625ab --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack8to1.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack8to1.h" + +static void deformableconv2d_im2col_sgemm_pack8to1_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 8; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1 = _val; + __m256 _v2 = _val; + __m256 _v3 = _val; + __m256 _v4 = _val; + if (v1_cond) + _v1 = _mm256_load_ps(data_im_ptr 
+ v1_pos * elempack); + if (v2_cond) + _v2 = _mm256_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm256_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm256_load_ps(data_im_ptr + v4_pos * elempack); + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val = _mm256_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm256_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm256_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm256_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val = _mm256_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm256_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack8to1_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8to16.h b/src/layer/x86/deformableconv2d_sgemm_pack8to16.h new file mode 100644 index 00000000000..161e983f1a0 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack8to16.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack8to16.h" + +static void deformableconv2d_im2col_sgemm_pack8to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 8; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1 = _val; + __m256 _v2 = _val; + __m256 _v3 = _val; + __m256 _v4 = _val; + if (v1_cond) + _v1 = _mm256_load_ps(data_im_ptr 
+ v1_pos * elempack); + if (v2_cond) + _v2 = _mm256_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm256_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm256_load_ps(data_im_ptr + v4_pos * elempack); + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val = _mm256_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm256_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm256_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm256_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val = _mm256_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm256_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack8to16_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8to4.h b/src/layer/x86/deformableconv2d_sgemm_pack8to4.h new file mode 100644 index 00000000000..45c853d2262 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack8to4.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack8to4.h" + +static void deformableconv2d_im2col_sgemm_pack8to4_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 8; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1 = _val; + __m256 _v2 = _val; + __m256 _v3 = _val; + __m256 _v4 = _val; + if (v1_cond) + _v1 = _mm256_load_ps(data_im_ptr 
+ v1_pos * elempack); + if (v2_cond) + _v2 = _mm256_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm256_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm256_load_ps(data_im_ptr + v4_pos * elempack); + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val = _mm256_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm256_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm256_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm256_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val = _mm256_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm256_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack8to4_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp index 869815283d9..caff2e17d06 100644 --- a/src/layer/x86/deformableconv2d_x86.cpp +++ b/src/layer/x86/deformableconv2d_x86.cpp @@ -14,63 +14,352 @@ #include "deformableconv2d_x86.h" +#if __SSE2__ +#include +#if __SSE4_1__ +#include +#if __AVX__ +#include +#endif +#endif // __SSE4_1__ +#endif // __SSE2__ +#include "x86_activation.h" +#include "x86_usability.h" + +#include "benchmark.h" +#include "cpu.h" #include "layer_type.h" namespace ncnn { +#include "deformableconv2d_sgemm.h" + +#if __SSE2__ +#include "deformableconv2d_pack4.h" +#include "deformableconv2d_pack1to4.h" +#include "deformableconv2d_pack4to1.h" + +#include "deformableconv2d_sgemm_pack4.h" +#include "deformableconv2d_sgemm_pack1to4.h" +#include "deformableconv2d_sgemm_pack4to1.h" + +#if __AVX__ +#include "deformableconv2d_pack8.h" +#include "deformableconv2d_pack4to8.h" +#include "deformableconv2d_pack1to8.h" +#include "deformableconv2d_pack8to4.h" +#include "deformableconv2d_pack8to1.h" + +#include "deformableconv2d_sgemm_pack8.h" +#include "deformableconv2d_sgemm_pack4to8.h" +#include "deformableconv2d_sgemm_pack1to8.h" +#include "deformableconv2d_sgemm_pack8to4.h" +#include "deformableconv2d_sgemm_pack8to1.h" + +#if __AVX512F__ +#include "deformableconv2d_pack16.h" +#include "deformableconv2d_pack8to16.h" +#include "deformableconv2d_pack4to16.h" +#include "deformableconv2d_pack1to16.h" +#include "deformableconv2d_pack16to8.h" +#include "deformableconv2d_pack16to4.h" +#include "deformableconv2d_pack16to1.h" + +#include "deformableconv2d_sgemm_pack16.h" +#include "deformableconv2d_sgemm_pack8to16.h" +#include "deformableconv2d_sgemm_pack4to16.h" +#include "deformableconv2d_sgemm_pack1to16.h" +#include "deformableconv2d_sgemm_pack16to8.h" +#include "deformableconv2d_sgemm_pack16to4.h" +#include "deformableconv2d_sgemm_pack16to1.h" +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ + DeformableConv2D_x86::DeformableConv2D_x86() { - one_blob_only = false; - support_inplace = false; +#if __SSE2__ + support_packing = true; +#endif // __SSE2__ - inner_product = 0; - permute = 0; + activation = 0; } -int DeformableConv2D_x86::create_pipeline(const Option& opt) +static int _4Dindex_to_1Dindex(int i0, int i1, int i2, int i3, int l1, int l2, int l3) +{ + return ((i0 * l1 + i1) * l2 + i2) * l3 + i3; +} + +static int _6Dindex_to_1Dindex(int i0, int i1, int i2, int i3, int i4, int i5, int l1, int l2, int l3, int l4, int l5) +{ + return ((((i0 * l1 + i1) * l2 + i2) * l3 + i3) * l4 + i4) * l5 + i5; +} + +static void 
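/*
   The helpers above flatten row-major multi-dimensional indices; e.g. with
   bounds l1 = 2, l2 = 3, l3 = 4, index (1, 0, 2, 3) maps to
   ((1 * 2 + 0) * 3 + 2) * 4 + 3 = 35. A quick illustrative check:

   #include <cassert>

   int main()
   {
       assert(_4Dindex_to_1Dindex(1, 0, 2, 3, 2, 3, 4) == 35);
       return 0;
   }
*/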
deformableconv2d_transform_kernel_packed_sse(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) { - const int in_c = weight_data_size / (num_output * kernel_h * kernel_w); + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pb-pa-inch/pa-kw-kh-outch/pb { - Mat weight_3d = weight_data.reshape(kernel_w * kernel_h, in_c, num_output); - weight_data_t.create(in_c, kernel_w * kernel_h, num_output); - if (weight_data_t.empty()) - return -100; - for (int q = 0; q < num_output; q++) - { - const Mat m = weight_3d.channel(q); - float* outptr = weight_data_t.channel(q); + const float* weight_ptr = weight_data; - for (int i = 0; i < kernel_w * kernel_h; i++) + weight_data_tm.create(num_input * maxk * num_output / (elempack * out_elempack), (size_t)4u * elempack * out_elempack, elempack * out_elempack); + float* ptr = weight_data_tm; + for (int oc = 0; oc < num_output; oc++) + { + for (int i = 0; i < kernel_h; i++) { - for (int j = 0; j < in_c; j++) + for (int j = 0; j < kernel_w; j++) { - *outptr++ = m.row(j)[i]; + for (int ic = 0; ic < num_input; ic++) + { + ptr[_6Dindex_to_1Dindex(oc / out_elempack, i, j, ic / elempack, ic % elempack, oc % out_elempack, kernel_h, kernel_w, num_input / elempack, elempack, out_elempack)] = weight_ptr[_4Dindex_to_1Dindex(oc, ic, i, j, num_input, kernel_h, kernel_w)]; + } } } } - weight_3d.release(); - weight_data_t = weight_data_t.reshape(in_c * kernel_w * kernel_h, num_output); + weight_data_tm = weight_data_tm.reshape(num_input / elempack, maxk, num_output / out_elempack); + } +} + +int DeformableConv2D_x86::create_pipeline(const Option& opt) +{ + activation = create_activation_layer(activation_type, activation_params, opt); + + int kernel_size = kernel_w * kernel_h; + int num_input = weight_data_size / kernel_size / num_output; + + int elempack = 1; + int out_elempack = 1; + +#if __SSE2__ + if (opt.use_packing_layout) + { +#if __AVX512F__ + elempack = num_input % 16 == 0 ? 16 : num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#elif __AVX__ + elempack = num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#else + elempack = num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 4 == 0 ? 
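/*
   The pack widths are chosen greedily, widest first, so the chosen width always
   divides the channel count. With AVX512 for example (hypothetical values):
   num_input = 48 -> 16, 24 -> 8, 12 -> 4, 10 -> 1.

   static int pick_elempack_avx512(int n)
   {
       return n % 16 == 0 ? 16 : n % 8 == 0 ? 8 : n % 4 == 0 ? 4 : 1;
   }
*/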
4 : 1; +#endif + } +#endif // __SSE2__ + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack16_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 8 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8to16_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 16 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack16to8_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 4 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4to16_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 16 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack16to4_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 1 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack1to16_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 16 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack16to1_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + +#endif // __AVX512F__ + + // pack8 + if (elempack == 8 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack4to8 + if (elempack == 4 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4to8_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, 
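/*
   An equivalent scalar formulation of the weight repack these branches drive,
   assuming pa and pb divide the channel counts; repack_weights is an
   illustrative name. src is [outch][inch][kh][kw], dst is
   [outch/pb][kh][kw][inch/pa][pa][pb], matching the _6Dindex_to_1Dindex mapping:

   static void repack_weights(const float* src, float* dst,
                              int num_input, int num_output,
                              int kernel_h, int kernel_w, int pa, int pb)
   {
       for (int oc0 = 0; oc0 < num_output; oc0 += pb)
           for (int i = 0; i < kernel_h; i++)
               for (int j = 0; j < kernel_w; j++)
                   for (int ic0 = 0; ic0 < num_input; ic0 += pa)
                       for (int a = 0; a < pa; a++)
                           for (int b = 0; b < pb; b++)
                               *dst++ = src[(((oc0 + b) * num_input + (ic0 + a)) * kernel_h + i) * kernel_w + j];
   }
*/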
num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack1to8 + if (elempack == 1 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack1to8_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack8to4 + if (elempack == 8 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack8to1 + if (elempack == 8 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif // __AVX__ + + // pack4 + if (elempack == 4 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack1to4 + if (elempack == 1 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack4to1 + if (elempack == 4 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4to1_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif // __SSE2__ - inner_product = ncnn::create_layer(ncnn::LayerType::InnerProduct); - ncnn::ParamDict pd; - pd.set(0, num_output); - pd.set(1, bias_term); - pd.set(2, weight_data_size); - pd.set(9, activation_type); - pd.set(10, activation_params); - inner_product->load_param(pd); - ncnn::Mat weights[2]; - weights[0] = weight_data_t; - if (bias_term) - weights[1] = bias_data; - inner_product->load_model(ncnn::ModelBinFromMatArray(weights)); - inner_product->create_pipeline(opt); + // pack1 + if (elempack == 1 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + weight_data_tm = weight_data; + } + } - permute = ncnn::create_layer(ncnn::LayerType::Permute); - ncnn::ParamDict permute_pd; - permute_pd.set(0, 1); - permute->load_param(permute_pd); - permute->create_pipeline(opt); + if (opt.lightmode) + { + weight_data.release(); } return 0; @@ -78,17 +367,11 @@ int 
DeformableConv2D_x86::create_pipeline(const Option& opt) int DeformableConv2D_x86::destroy_pipeline(const Option& opt) { - if (inner_product) + if (activation) { - inner_product->destroy_pipeline(opt); - delete inner_product; - inner_product = 0; - } - if (permute) - { - permute->destroy_pipeline(opt); - delete permute; - permute = 0; + activation->destroy_pipeline(opt); + delete activation; + activation = 0; } return 0; @@ -98,134 +381,427 @@ int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vec { const Mat& bottom_blob = bottom_blobs[0]; const Mat& offset = bottom_blobs[1]; - const bool has_mask = (bottom_blobs.size() == 3); + Mat& top_blob = top_blobs[0]; - const int w = bottom_blob.w; - const int h = bottom_blob.h; - const int in_c = bottom_blob.c; - const size_t elemsize = bottom_blob.elemsize; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int out_w = (w + pad_left + pad_right - kernel_extent_w) / stride_w + 1; const int out_h = (h + pad_top + pad_bottom - kernel_extent_h) / stride_h + 1; - // output = im2col matmul weight_t, im2col.shape is [out_h * out_w, kernel_h * kernel_w * in_c] (in python), - // weight_t.shape is [num_output, kernel_h * kernel_w * in_c] (in python), - // output.shape is [out_h * out_w, num_output] (in python). - Mat im2col; - im2col.create(kernel_h * kernel_w * in_c * out_h * out_w, elemsize, opt.blob_allocator); - if (im2col.empty()) - return -100; + int out_elempack = 1; +#if __SSE2__ + if (opt.use_packing_layout) + { +#if __AVX512F__ + out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#elif __AVX__ + out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#else + out_elempack = num_output % 4 == 0 ? 
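/*
   out_w and out_h follow the standard dilated-convolution size formula used
   above. Worked example (hypothetical values): w = 64, pad_left = pad_right = 1,
   kernel_w = 3, dilation_w = 2, stride_w = 1 gives kernel_extent_w = 5 and
   out_w = (64 + 1 + 1 - 5) / 1 + 1 = 62.

   static int conv_out_size(int in, int pad0, int pad1, int k, int dilation, int stride)
   {
       const int kernel_extent = dilation * (k - 1) + 1;
       return (in + pad0 + pad1 - kernel_extent) / stride + 1;
   }
*/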
4 : 1; +#endif + } +#endif // __SSE2__ + size_t out_elemsize = elemsize / elempack * out_elempack; - Mat& output = top_blobs[0]; - output.create(num_output, out_h * out_w, elemsize, opt.blob_allocator); - if (output.empty()) + top_blob.create(out_w, out_h, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) return -100; - Mat bottom_blob_flatten = bottom_blob.reshape(w * h * in_c); - Mat offset_flatten = offset.reshape(offset.w * offset.h * offset.c); - const float* data_im_ptr = bottom_blob_flatten; - const float* data_offset_ptr = offset_flatten; - float* im2col_ptr = im2col; + const int num_input = channels * elempack; - // im2col - #pragma omp parallel for num_threads(opt.num_threads) - for (int h_col = 0; h_col < out_h; h_col++) +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16 && out_elempack == 16) { - for (int w_col = 0; w_col < out_w; w_col++) + if (opt.use_sgemm_convolution) { - int h_in = h_col * stride_h - pad_top; - int w_in = w_col * stride_w - pad_left; - float* data_col_ptr = im2col_ptr + (h_col * out_w + w_col) * kernel_h * kernel_w * in_c; - for (int i = 0; i < kernel_h; i++) + deformableconv2d_im2col_sgemm_pack16_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) { - for (int j = 0; j < kernel_w; j++) - { - const int data_offset_h_ptr = (((i * kernel_w + j) * 2) * out_h + h_col) * out_w + w_col; - const int data_offset_w_ptr = (((i * kernel_w + j) * 2 + 1) * out_h + h_col) * out_w + w_col; - - const float offset_h = data_offset_ptr[data_offset_h_ptr]; - const float offset_w = data_offset_ptr[data_offset_w_ptr]; - const float mask_ = has_mask ? bottom_blobs[2].channel(i * kernel_w + j).row(h_col)[w_col] : 1.f; - const float h_im = h_in + i * dilation_h + offset_h; - const float w_im = w_in + j * dilation_w + offset_w; - - // Bilinear - const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; - float w1 = 0.f; - float w2 = 0.f; - float w3 = 0.f; - float w4 = 0.f; - bool v1_cond = false; - bool v2_cond = false; - bool v3_cond = false; - bool v4_cond = false; - int v1_pos = 0; - int v2_pos = 0; - int v3_pos = 0; - int v4_pos = 0; - if (cond) - { - int h_low = floor(h_im); - int w_low = floor(w_im); - int h_high = h_low + 1; - int w_high = w_low + 1; - - float lh = h_im - h_low; - float lw = w_im - w_low; - float hh = 1 - lh; - float hw = 1 - lw; - - v1_cond = (h_low >= 0 && w_low >= 0); - v2_cond = (h_low >= 0 && w_high <= w - 1); - v3_cond = (h_high <= h - 1 && w_low >= 0); - v4_cond = (h_high <= h - 1 && w_high <= w - 1); - if (v1_cond) - v1_pos = h_low * w + w_low; - if (v2_cond) - v2_pos = h_low * w + w_high; - if (v3_cond) - v3_pos = h_high * w + w_low; - if (v4_cond) - v4_pos = h_high * w + w_high; - - w1 = hh * hw; - w2 = hh * lw; - w3 = lh * hw; - w4 = lh * lw; - } + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack16_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 8 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack8to16_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, 
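/*
   The direct packed kernels receive activation_type/activation_params and fuse
   the activation, while the sgemm branches apply it afterwards through
   activation->forward_inplace(). The post-pass amounts to an element-wise sweep
   over top_blob, e.g. for ReLU (illustrative sketch):

   static void relu_inplace(float* data, size_t n)
   {
       for (size_t i = 0; i < n; i++)
           data[i] = data[i] > 0.f ? data[i] : 0.f;
   }
*/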
+
+    if (elempack == 8 && out_elempack == 16)
+    {
+        if (opt.use_sgemm_convolution)
+        {
+            deformableconv2d_im2col_sgemm_pack8to16_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+        else
+        {
+            deformableconv2d_pack8to16_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
+        }
+    }
+
+    if (elempack == 16 && out_elempack == 8)
+    {
+        if (opt.use_sgemm_convolution)
+        {
+            deformableconv2d_im2col_sgemm_pack16to8_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+        else
+        {
+            deformableconv2d_pack16to8_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
+        }
+    }
+
+    if (elempack == 4 && out_elempack == 16)
+    {
+        if (opt.use_sgemm_convolution)
+        {
+            deformableconv2d_im2col_sgemm_pack4to16_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+        else
+        {
+            deformableconv2d_pack4to16_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
+        }
+    }
+
+    if (elempack == 16 && out_elempack == 4)
+    {
+        if (opt.use_sgemm_convolution)
+        {
+            deformableconv2d_im2col_sgemm_pack16to4_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+        else
+        {
+            deformableconv2d_pack16to4_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
+        }
+    }
+
+    if (elempack == 1 && out_elempack == 16)
+    {
+        if (opt.use_sgemm_convolution)
+        {
+            deformableconv2d_im2col_sgemm_pack1to16_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+        else
+        {
+            deformableconv2d_pack1to16_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
+        }
+    }
+
+    if (elempack == 16 && out_elempack == 1)
+    {
+        if (opt.use_sgemm_convolution)
+        {
+            deformableconv2d_im2col_sgemm_pack16to1_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+        else
+        {
+            deformableconv2d_pack16to1_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
+        }
+    }
+
+#endif // __AVX512F__
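All the packAtoB variants above exist because the input and output interleaves differ: in a packN blob, N consecutive channels share one plane, interleaved per pixel. A toy planar-to-packN repack showing that layout; the helper is ours, not an ncnn API, and channels is assumed divisible by N.

#include <vector>

static std::vector<float> to_packn(const std::vector<float>& planar, // [channels][size]
                                   int channels, int size, int N)
{
    std::vector<float> packed(planar.size());
    for (int c = 0; c < channels; c++)
        for (int i = 0; i < size; i++)
            // plane c / N holds N lanes per pixel; channel c sits in lane c % N
            packed[(c / N) * (size * N) + i * N + (c % N)] = planar[c * size + i];
    return packed;
}

The dedicated AtoB kernels read one interleave and write the other on the fly, so no such repack pass is ever materialized.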
+
+    if (elempack == 8 && out_elempack == 8)
+    {
+        if (opt.use_sgemm_convolution)
+        {
+            deformableconv2d_im2col_sgemm_pack8_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+        else
+        {
+            deformableconv2d_pack8_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
+        }
+    }
+
+    if (elempack == 1 && out_elempack == 8)
+    {
+        if (opt.use_sgemm_convolution)
+        {
+            deformableconv2d_im2col_sgemm_pack1to8_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+        else
+        {
+            deformableconv2d_pack1to8_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
+        }
+    }
+
+    if (elempack == 4 && out_elempack == 8)
+    {
+        if (opt.use_sgemm_convolution)
+        {
+            deformableconv2d_im2col_sgemm_pack4to8_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+        else
+        {
+            deformableconv2d_pack4to8_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
+        }
+    }
+
+    if (elempack == 8 && out_elempack == 1)
+    {
+        if (opt.use_sgemm_convolution)
+        {
+            deformableconv2d_im2col_sgemm_pack8to1_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+        else
+        {
+            deformableconv2d_pack8to1_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
+        }
+    }
+
+    if (elempack == 8 && out_elempack == 4)
+    {
+        if (opt.use_sgemm_convolution)
+        {
+            deformableconv2d_im2col_sgemm_pack8to4_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+        else
+        {
+            deformableconv2d_pack8to4_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
+        }
+    }
+#endif // __AVX__
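Every branch ends the same way: the direct kernels fuse the activation via activation_type/activation_params, while the sgemm arm applies the activation layer afterwards. For reference, a scalar sketch of the fused epilogue; ncnn's real helper is the activation_ss used by the naive path below, and the type numbering and params layout here follow ncnn's usual convention, which we assume rather than restate from this patch.

#include <algorithm>
#include <cmath>

static float activation_ss_sketch(float v, int type, const float* params)
{
    switch (type)
    {
    case 1: return std::max(v, 0.f);                            // ReLU
    case 2: return v > 0.f ? v : v * params[0];                 // LeakyReLU, slope in params[0]
    case 3: return std::min(std::max(v, params[0]), params[1]); // clip to [min, max]
    case 4: return 1.f / (1.f + std::exp(-v));                  // sigmoid
    default: return v;                                          // 0: identity
    }
}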
+
+    if (elempack == 4 && out_elempack == 4)
+    {
+        if (opt.use_sgemm_convolution)
+        {
+            deformableconv2d_im2col_sgemm_pack4_sse(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+        else
+        {
+            deformableconv2d_pack4_sse(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
+        }
+    }
+
+    if (elempack == 1 && out_elempack == 4)
+    {
+        if (opt.use_sgemm_convolution)
+        {
+            deformableconv2d_im2col_sgemm_pack1to4_sse(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+        else
+        {
+            deformableconv2d_pack1to4_sse(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
+        }
+    }
+
+    if (elempack == 4 && out_elempack == 1)
+    {
+        if (opt.use_sgemm_convolution)
+        {
+            deformableconv2d_im2col_sgemm_pack4to1_sse(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+        else
+        {
+            deformableconv2d_pack4to1_sse(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
+        }
+    }
+#endif // __SSE2__
+
+    if (elempack == 1 && out_elempack == 1)
+    {
+        if (opt.use_sgemm_convolution)
+        {
+            deformableconv2d_im2col_sgemm_sse(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+        else
+        {
+            const bool offset_not_pack = offset.elempack == 1;
+            const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true;
+            const float* weight_ptr = weight_data_tm;
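The naive loop that follows samples the input at the fractional positions h_im/w_im with zero padding outside the border; this is the scalar core every vectorized path above replicates. As a standalone function over a single h x w plane (our toy signature, mirroring the cond and v*_cond guards below):

#include <cmath>

static float bilinear_sample(const float* img, int h, int w, float h_im, float w_im)
{
    if (!(h_im > -1 && w_im > -1 && h_im < h && w_im < w))
        return 0.f; // sample falls entirely outside the padded border

    int h_low = (int)std::floor(h_im);
    int w_low = (int)std::floor(w_im);
    int h_high = h_low + 1;
    int w_high = w_low + 1;
    float lh = h_im - h_low, lw = w_im - w_low;
    float hh = 1.f - lh, hw = 1.f - lw;

    // corners outside the map contribute zero, same as the v*_cond flags
    float v1 = (h_low >= 0 && w_low >= 0) ? img[h_low * w + w_low] : 0.f;
    float v2 = (h_low >= 0 && w_high <= w - 1) ? img[h_low * w + w_high] : 0.f;
    float v3 = (h_high <= h - 1 && w_low >= 0) ? img[h_high * w + w_low] : 0.f;
    float v4 = (h_high <= h - 1 && w_high <= w - 1) ? img[h_high * w + w_high] : 0.f;

    // the four corner weights sum to 1: (hh + lh) * (hw + lw)
    return hh * hw * v1 + hh * lw * v2 + lh * hw * v3 + lh * lw * v4;
}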
-                    const float* data_im_channel_ptr = data_im_ptr;
-                    for (int c_im = 0; c_im < in_c; c_im++)
+            // naive deformable conv
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int h_col = 0; h_col < out_h; h_col++)
+            {
+                for (int w_col = 0; w_col < out_w; w_col++)
+                {
+                    int h_in = h_col * stride_h - pad_top;
+                    int w_in = w_col * stride_w - pad_left;
+                    for (int oc = 0; oc < num_output; oc++)
                     {
-                        float val = 0.f;
-                        if (cond)
+                        float sum = 0.f;
+                        if (bias_term)
+                            sum = bias_data[oc];
+                        for (int i = 0; i < kernel_h; i++)
                         {
-                            float v1 = v1_cond ? data_im_channel_ptr[v1_pos] : 0.f;
-                            float v2 = v2_cond ? data_im_channel_ptr[v2_pos] : 0.f;
-                            float v3 = v3_cond ? data_im_channel_ptr[v3_pos] : 0.f;
-                            float v4 = v4_cond ? data_im_channel_ptr[v4_pos] : 0.f;
-                            val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
+                            for (int j = 0; j < kernel_w; j++)
+                            {
+                                float offset_h = 0.f;
+                                float offset_w = 0.f;
+                                float mask_ = 1.f;
+                                if (offset_not_pack)
+                                {
+                                    offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
+                                    offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
+                                }
+                                else
+                                {
+                                    const int y_c = (i * kernel_w + j) * 2;
+                                    const int x_c = (i * kernel_w + j) * 2 + 1;
+                                    offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack];
+                                    offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack];
+                                }
+                                if (has_mask)
+                                {
+                                    const Mat& mask = bottom_blobs[2];
+                                    if (mask_not_pack)
+                                    {
+                                        mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
+                                    }
+                                    else
+                                    {
+                                        const int m_c = i * kernel_w + j;
+                                        mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
+                                    }
+                                }
+                                const float h_im = h_in + i * dilation_h + offset_h;
+                                const float w_im = w_in + j * dilation_w + offset_w;
+
+                                // Bilinear
+                                const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w;
+                                int h_low = 0;
+                                int w_low = 0;
+                                int h_high = 0;
+                                int w_high = 0;
+                                float w1 = 0.f;
+                                float w2 = 0.f;
+                                float w3 = 0.f;
+                                float w4 = 0.f;
+                                bool v1_cond = false;
+                                bool v2_cond = false;
+                                bool v3_cond = false;
+                                bool v4_cond = false;
+                                if (cond)
+                                {
+                                    h_low = floor(h_im);
+                                    w_low = floor(w_im);
+                                    h_high = h_low + 1;
+                                    w_high = w_low + 1;
+
+                                    float lh = h_im - h_low;
+                                    float lw = w_im - w_low;
+                                    float hh = 1 - lh;
+                                    float hw = 1 - lw;
+
+                                    v1_cond = (h_low >= 0 && w_low >= 0);
+                                    v2_cond = (h_low >= 0 && w_high <= w - 1);
+                                    v3_cond = (h_high <= h - 1 && w_low >= 0);
+                                    v4_cond = (h_high <= h - 1 && w_high <= w - 1);
+
+                                    w1 = hh * hw;
+                                    w2 = hh * lw;
+                                    w3 = lh * hw;
+                                    w4 = lh * lw;
+                                }
+
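With the four corner weights fixed, the per-channel accumulation below walks the kernel weights in plain OIHW order; the flat index it computes is the usual nested-dimension collapse. A one-liner making that explicit (helper name ours):

// tap (oc, ic, i, j) of an OIHW weight block of shape
// [num_output][channels][kernel_h][kernel_w]
static inline int weight_index(int oc, int ic, int i, int j,
                               int channels, int kernel_h, int kernel_w)
{
    return ((oc * channels + ic) * kernel_h + i) * kernel_w + j;
}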
+                                for (int ic = 0; ic < channels; ic++)
+                                {
+                                    float val = 0.f;
+                                    if (cond)
+                                    {
+                                        float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f;
+                                        float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f;
+                                        float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f;
+                                        float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f;
+                                        val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
+                                    }
+                                    sum += val * mask_ * weight_ptr[((oc * channels + ic) * kernel_h + i) * kernel_w + j];
+                                }
+                            }
+                        }
                         }
-                        *data_col_ptr = val * mask_;
-                        data_col_ptr += 1;
-                        data_im_channel_ptr += h * w;
+                        top_blob.channel(oc).row(h_col)[w_col] = activation_ss(sum, activation_type, activation_params);
                     }
                 }
             }
         }
     }
 
-    im2col = im2col.reshape(kernel_h * kernel_w * in_c, out_h * out_w);
-    // call InnerProduct
-    inner_product->forward(im2col, output, opt);
-    ncnn::Mat output_t;
-    // call Permute
-    permute->forward(output, output_t, opt);
-    output_t = output_t.reshape(out_w, out_h, num_output);
-    top_blobs[0] = output_t;
+    return 0;
 }
diff --git a/src/layer/x86/deformableconv2d_x86.h b/src/layer/x86/deformableconv2d_x86.h
index 0e21c9392af..a4f02f8fccb 100644
--- a/src/layer/x86/deformableconv2d_x86.h
+++ b/src/layer/x86/deformableconv2d_x86.h
@@ -30,10 +30,10 @@ class DeformableConv2D_x86 : virtual public DeformableConv2D
     virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 public:
-    Mat weight_data_t;
+    Layer* activation;
 
-    Layer* inner_product;
-    Layer* permute;
+    Mat weight_data_tm;
+    Mat weight_sgemm_data;
 };
 
 } // namespace ncnn
diff --git a/tests/test_deformableconv2d.cpp b/tests/test_deformableconv2d.cpp
index 01511e54496..b62557df98c 100644
--- a/tests/test_deformableconv2d.cpp
+++ b/tests/test_deformableconv2d.cpp
@@ -93,7 +93,23 @@ static int test_deformableconv2d_0()
                   || test_deformableconv2d(9, 7, 8, 4, k, d, s, p, 1)
                   || test_deformableconv2d(9, 7, 8, 13, k, d, s, p, 0)
                   || test_deformableconv2d(9, 7, 13, 8, k, d, s, p, 1)
-                  || test_deformableconv2d(9, 7, 16, 16, k, d, s, p, 0);
+                  || test_deformableconv2d(9, 7, 16, 16, k, d, s, p, 0)
+                  || test_deformableconv2d(16, 16, 1 * 3, 1 * 3, k, d, s, p, 1)
+                  || test_deformableconv2d(16, 16, 1 * 3, 4 * 3, k, d, s, p, 1)
+                  || test_deformableconv2d(16, 16, 1 * 3, 8 * 3, k, d, s, p, 1)
+                  || test_deformableconv2d(16, 16, 1 * 3, 16 * 3, k, d, s, p, 1)
+                  || test_deformableconv2d(16, 16, 4 * 3, 1 * 3, k, d, s, p, 1)
+                  || test_deformableconv2d(16, 16, 4 * 3, 4 * 3, k, d, s, p, 1)
+                  || test_deformableconv2d(16, 16, 4 * 3, 8 * 3, k, d, s, p, 1)
+                  || test_deformableconv2d(16, 16, 4 * 3, 16 * 3, k, d, s, p, 1)
+                  || test_deformableconv2d(16, 16, 8 * 3, 1 * 3, k, d, s, p, 1)
+                  || test_deformableconv2d(16, 16, 8 * 3, 4 * 3, k, d, s, p, 1)
+                  || test_deformableconv2d(16, 16, 8 * 3, 8 * 3, k, d, s, p, 1)
+                  || test_deformableconv2d(16, 16, 8 * 3, 16 * 3, k, d, s, p, 1)
+                  || test_deformableconv2d(16, 16, 16 * 3, 1 * 3, k, d, s, p, 1)
+                  || test_deformableconv2d(16, 16, 16 * 3, 4 * 3, k, d, s, p, 1)
+                  || test_deformableconv2d(16, 16, 16 * 3, 8 * 3, k, d, s, p, 1)
+                  || test_deformableconv2d(16, 16, 16 * 3, 16 * 3, k, d, s, p, 1);
 
         if (ret != 0)
             return -1;
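The sixteen new cases sweep every (input, output) pack pairing on a 16x16 image: channel counts {1, 4, 8, 16} * 3 land on elempack 1, 4, 8 and 16 respectively, and the factor 3 keeps a count from being promoted to a wider pack (24 % 16 != 0, so 24 stays pack8). For illustration, the same sweep as a loop; this hypothetical compact form is equivalent to the chained calls above, reusing the test file's existing test_deformableconv2d and its k, d, s, p loop variables.

static int test_deformableconv2d_packs(int k, int d, int s, int p)
{
    const int packs[4] = {1, 4, 8, 16};
    for (int i = 0; i < 4; i++)
    {
        for (int j = 0; j < 4; j++)
        {
            // 16x16 input, packs[i]*3 input channels, packs[j]*3 output channels, with bias
            int ret = test_deformableconv2d(16, 16, packs[i] * 3, packs[j] * 3, k, d, s, p, 1);
            if (ret != 0)
                return ret;
        }
    }
    return 0;
}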