Skip to content

Commit

Permalink
x86 optimization for convolution int8 packed unified elempack (#4861)
Browse files Browse the repository at this point in the history
  • Loading branch information
nihui authored Jul 22, 2023
1 parent 2303b77 commit 5570970
Show file tree
Hide file tree
Showing 18 changed files with 5,589 additions and 459 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/linux-aarch64-cpu-gcc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ jobs:
uses: actions/cache@v3
with:
path: qemu-install
- key: qemu-aarch64-install-20220502-2
+ key: qemu-aarch64-install-20230717
- name: install-qemu-build-deps
if: steps.cache-qemu.outputs.cache-hit != 'true'
run: |
Expand All @@ -167,7 +167,7 @@ jobs:
with:
repository: qemu/qemu
path: qemu
- ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65
+ ref: ed8ad9728a9c0eec34db9dff61dfa2f1dd625637
- name: qemu
if: steps.cache-qemu.outputs.cache-hit != 'true'
run: |
Expand Down
12 changes: 6 additions & 6 deletions src/layer/arm/convolution_3x3_winograd.h
Original file line number Diff line number Diff line change
Expand Up @@ -6302,9 +6302,9 @@ static inline void conv3x3s1_winograd43_transform_input_tile(const Mat& bottom_b
float32x4x2_t _t01 = vzipq_f32(_t0, _t1);

_r0 = vget_low_f32(_t01.val[0]);
- if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
- if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
- if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
+ if (tj * 4 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
+ if (tj * 4 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
+ if (tj * 4 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
if (tj * 4 + 4 < w)
{
float tmp[2] = {r0[4], r1[4]};
Expand Down Expand Up @@ -8081,9 +8081,9 @@ static inline void conv3x3s1_winograd63_transform_input_tile(const Mat& bottom_b
float32x4x2_t _t01 = vzipq_f32(_t0, _t1);

_r0 = vget_low_f32(_t01.val[0]);
- if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
- if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
- if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
+ if (tj * 6 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
+ if (tj * 6 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
+ if (tj * 6 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
if (tj * 6 + 4 < w)
{
_t0 = vld1q_f32(r0 + 4);
Expand Down
12 changes: 6 additions & 6 deletions src/layer/arm/convolution_3x3_winograd_bf16s.h
Original file line number Diff line number Diff line change
Expand Up @@ -1540,9 +1540,9 @@ static inline void conv3x3s1_winograd43_transform_input_tile_bf16s(const Mat& bo
float32x4_t _t1_fp32 = bfloat2float(_t01.val[1]);

_r0 = vget_low_f32(_t0_fp32);
- if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
- if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
- if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
+ if (tj * 4 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
+ if (tj * 4 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
+ if (tj * 4 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
if (tj * 4 + 4 < w)
{
float tmp[2] = {bfloat16_to_float32(r0[4]), bfloat16_to_float32(r1[4])};
Expand Down Expand Up @@ -3211,9 +3211,9 @@ static inline void conv3x3s1_winograd63_transform_input_tile_bf16s(const Mat& bo
float32x4_t _t1_fp32 = bfloat2float(_t01.val[1]);

_r0 = vget_low_f32(_t0_fp32);
- if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
- if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
- if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
+ if (tj * 6 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
+ if (tj * 6 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
+ if (tj * 6 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
if (tj * 6 + 4 < w)
{
_t0 = vld1_u16(r0 + 4);
Expand Down
82 changes: 0 additions & 82 deletions src/layer/x86/convolution_int8.h

This file was deleted.

89 changes: 0 additions & 89 deletions src/layer/x86/convolution_pack1to4_int8.h

This file was deleted.

96 changes: 0 additions & 96 deletions src/layer/x86/convolution_pack8to1_int8.h

This file was deleted.

Loading

0 comments on commit 5570970

Please sign in to comment.