x86 optimization for convolution int8 packed unified elempack (#4861)

Tencent · Jul 22, 2023 · 5570970
1 parent 2303b77
commit 5570970
Show file tree

Hide file tree

Showing 18 changed files with 5,589 additions and 459 deletions.
diff --git a/.github/workflows/linux-aarch64-cpu-gcc.yml b/.github/workflows/linux-aarch64-cpu-gcc.yml
@@ -155,7 +155,7 @@ jobs:
       uses: actions/cache@v3
       with:
         path: qemu-install
-        key: qemu-aarch64-install-20220502-2
+        key: qemu-aarch64-install-20230717
     - name: install-qemu-build-deps
       if: steps.cache-qemu.outputs.cache-hit != 'true'
       run: |
@@ -167,7 +167,7 @@ jobs:
       with:
         repository: qemu/qemu
         path: qemu
-        ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65
+        ref: ed8ad9728a9c0eec34db9dff61dfa2f1dd625637
     - name: qemu
       if: steps.cache-qemu.outputs.cache-hit != 'true'
       run: |

diff --git a/src/layer/arm/convolution_3x3_winograd.h b/src/layer/arm/convolution_3x3_winograd.h
@@ -6302,9 +6302,9 @@ static inline void conv3x3s1_winograd43_transform_input_tile(const Mat& bottom_b
                         float32x4x2_t _t01 = vzipq_f32(_t0, _t1);
 
                         _r0 = vget_low_f32(_t01.val[0]);
-                        if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
-                        if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
-                        if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
+                        if (tj * 4 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
+                        if (tj * 4 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
+                        if (tj * 4 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
                         if (tj * 4 + 4 < w)
                         {
                             float tmp[2] = {r0[4], r1[4]};
@@ -8081,9 +8081,9 @@ static inline void conv3x3s1_winograd63_transform_input_tile(const Mat& bottom_b
                         float32x4x2_t _t01 = vzipq_f32(_t0, _t1);
 
                         _r0 = vget_low_f32(_t01.val[0]);
-                        if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
-                        if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
-                        if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
+                        if (tj * 6 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
+                        if (tj * 6 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
+                        if (tj * 6 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
                         if (tj * 6 + 4 < w)
                         {
                             _t0 = vld1q_f32(r0 + 4);

diff --git a/src/layer/arm/convolution_3x3_winograd_bf16s.h b/src/layer/arm/convolution_3x3_winograd_bf16s.h
@@ -1540,9 +1540,9 @@ static inline void conv3x3s1_winograd43_transform_input_tile_bf16s(const Mat& bo
                         float32x4_t _t1_fp32 = bfloat2float(_t01.val[1]);
 
                         _r0 = vget_low_f32(_t0_fp32);
-                        if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
-                        if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
-                        if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
+                        if (tj * 4 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
+                        if (tj * 4 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
+                        if (tj * 4 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
                         if (tj * 4 + 4 < w)
                         {
                             float tmp[2] = {bfloat16_to_float32(r0[4]), bfloat16_to_float32(r1[4])};
@@ -3211,9 +3211,9 @@ static inline void conv3x3s1_winograd63_transform_input_tile_bf16s(const Mat& bo
                         float32x4_t _t1_fp32 = bfloat2float(_t01.val[1]);
 
                         _r0 = vget_low_f32(_t0_fp32);
-                        if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
-                        if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
-                        if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
+                        if (tj * 6 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
+                        if (tj * 6 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
+                        if (tj * 6 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
                         if (tj * 6 + 4 < w)
                         {
                             _t0 = vld1_u16(r0 + 4);

diff --git a/src/layer/x86/convolution_int8.h b/src/layer/x86/convolution_int8.h
diff --git a/src/layer/x86/convolution_pack1to4_int8.h b/src/layer/x86/convolution_pack1to4_int8.h
diff --git a/src/layer/x86/convolution_pack8to1_int8.h b/src/layer/x86/convolution_pack8to1_int8.h