From d28937e18081a64b74bcd8ffacabd9445a9da12b Mon Sep 17 00:00:00 2001
From: George Steed
Date: Mon, 21 Oct 2024 16:05:57 +0100
Subject: [PATCH] PR #444 from vvenc with minor changes.

original commit message:

Add Neon implementations of MCIF simdFilter{4,8}xX_N8 (#444)

Mostly a copy of the existing approach for simdFilter16xX_N8_neon. Also
rename the helper functions used by the existing simdFilter16xX_N8_neon
kernel to avoid name clashes with the two new kernels introduced by this
commit.

Running a video encoding job on a Neoverse V2 machine using the
--preset=fast setting shows a ~1.5% improvement in reported FPS.
---
 .../arm/neon/InterpolationFilter_neon.cpp | 287 +++++++++++++++---
 1 file changed, 237 insertions(+), 50 deletions(-)

diff --git a/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
index 25a23e0d..22abd608 100644
--- a/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
+++ b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
@@ -133,63 +133,51 @@ static void simdInterpolateN2_2D_neon( const ClpRng& clpRng, const Pel* src, con
   }
 }
 
-static int16x8_t simdFilter16xX_N8_half( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
+static int16x4_t filter4xX_N8_neon( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
 {
-  int16x8_t vsrca00 = vld1q_s16( src + 0 );
-  int16x8_t vsrca01 = vld1q_s16( src + 1 );
-  int16x8_t vsrca10 = vld1q_s16( src + 2 );
-  int16x8_t vsrca11 = vld1q_s16( src + 3 );
-  int16x8_t vsrcb00 = vld1q_s16( src + 4 );
-  int16x8_t vsrcb01 = vld1q_s16( src + 5 );
-  int16x8_t vsrcb10 = vld1q_s16( src + 6 );
-  int16x8_t vsrcb11 = vld1q_s16( src + 7 );
-
-  int32x4_t a0 = vmull_s16( vget_low_s16( vsrca00 ), vget_low_s16( ch ) );
-  int32x4_t a1 = vmull_s16( vget_low_s16( vsrca01 ), vget_low_s16( ch ) );
-  int32x4_t a2 = vmull_s16( vget_low_s16( vsrca10 ), vget_low_s16( ch ) );
-  int32x4_t a3 = vmull_s16( vget_low_s16( vsrca11 ), vget_low_s16( ch ) );
-
-  int32x4_t b0 = vmull_s16( vget_low_s16( vsrcb00 ), vget_low_s16( ch ) );
-  int32x4_t b1 = vmull_s16( vget_low_s16( vsrcb01 ), vget_low_s16( ch ) );
-  int32x4_t b2 = vmull_s16( vget_low_s16( vsrcb10 ), vget_low_s16( ch ) );
-  int32x4_t b3 = vmull_s16( vget_low_s16( vsrcb11 ), vget_low_s16( ch ) );
-
-  a0 = vmlal_s16( a0, vget_high_s16( vsrca00 ), vget_high_s16( ch ) );
-  a1 = vmlal_s16( a1, vget_high_s16( vsrca01 ), vget_high_s16( ch ) );
-  a2 = vmlal_s16( a2, vget_high_s16( vsrca10 ), vget_high_s16( ch ) );
-  a3 = vmlal_s16( a3, vget_high_s16( vsrca11 ), vget_high_s16( ch ) );
-
-  b0 = vmlal_s16( b0, vget_high_s16( vsrcb00 ), vget_high_s16( ch ) );
-  b1 = vmlal_s16( b1, vget_high_s16( vsrcb01 ), vget_high_s16( ch ) );
-  b2 = vmlal_s16( b2, vget_high_s16( vsrcb10 ), vget_high_s16( ch ) );
-  b3 = vmlal_s16( b3, vget_high_s16( vsrcb11 ), vget_high_s16( ch ) );
+  int16x8_t vsrca0 = vld1q_s16( src + 0 );
+  int16x8_t vsrca1 = vld1q_s16( src + 1 );
+  int16x8_t vsrca2 = vld1q_s16( src + 2 );
+  int16x8_t vsrca3 = vld1q_s16( src + 3 );
 
-  int32x4_t vsuma = vpaddq_s32( vpaddq_s32( a0, a1 ), vpaddq_s32( a2, a3 ) );
-  int32x4_t vsumb = vpaddq_s32( vpaddq_s32( b0, b1 ), vpaddq_s32( b2, b3 ) );
+  int32x4_t a0 = vmull_s16( vget_low_s16( vsrca0 ), vget_low_s16( ch ) );
+  int32x4_t a1 = vmull_s16( vget_low_s16( vsrca1 ), vget_low_s16( ch ) );
+  int32x4_t a2 = vmull_s16( vget_low_s16( vsrca2 ), vget_low_s16( ch ) );
+  int32x4_t a3 = vmull_s16( vget_low_s16( vsrca3 ), vget_low_s16( ch ) );
 
-  vsuma = vaddq_s32( vsuma, voffset1 );
-  vsumb = vaddq_s32( vsumb, voffset1 );
+  a0 = vmlal_s16( a0, vget_high_s16( vsrca0 ), vget_high_s16( ch ) );
+  a1 = vmlal_s16( a1, vget_high_s16( vsrca1 ), vget_high_s16( ch ) );
+  a2 = vmlal_s16( a2, vget_high_s16( vsrca2 ), vget_high_s16( ch ) );
+  a3 = vmlal_s16( a3, vget_high_s16( vsrca3 ), vget_high_s16( ch ) );
 
-  vsuma = vshlq_s32( vsuma, invshift1st );
-  vsumb = vshlq_s32( vsumb, invshift1st );
+  int32x4_t vsuma = vpaddq_s32( vpaddq_s32( a0, a1 ), vpaddq_s32( a2, a3 ) );
+  vsuma = vaddq_s32( vsuma, voffset1 );
+  vsuma = vshlq_s32( vsuma, invshift1st );
+  return vqmovn_s32( vsuma );
+}
 
-  return vcombine_s16( vqmovn_s32( vsuma ), vqmovn_s32( vsumb ) );
+static int16x8_t filter8xX_N8_neon( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
+{
+  int16x4_t lo = filter4xX_N8_neon( src + 0, ch, voffset1, invshift1st );
+  int16x4_t hi = filter4xX_N8_neon( src + 4, ch, voffset1, invshift1st );
+  return vcombine_s16( lo, hi );
 }
 
-static int16x8x2_t simdFilter16xX_N8_step( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
+static int16x8x2_t filter16xX_N8_neon( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
 {
-  int16x8_t a = simdFilter16xX_N8_half( src + 0, ch, voffset1, invshift1st );
-  int16x8_t b = simdFilter16xX_N8_half( src + 8, ch, voffset1, invshift1st );
+  int16x8_t a = filter8xX_N8_neon( src + 0, ch, voffset1, invshift1st );
+  int16x8_t b = filter8xX_N8_neon( src + 8, ch, voffset1, invshift1st );
   return ( int16x8x2_t ){ a, b };
 }
 
 template<bool isLast>
-static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const *src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride, int width, int height, TFilterCoeff const *coeffH, TFilterCoeff const *coeffV )
+static void simdFilter4xX_N8_neon( const ClpRng& clpRng, Pel const* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride,
+                                   int width, int height, TFilterCoeff const* coeffH, TFilterCoeff const* coeffV )
 {
   OFFSET( src, srcStride, -3, -3 );
-  // with the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be
-  // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20
+  // With the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be
+  // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20.
   const int headRoom = std::max( 2, ( IF_INTERNAL_PREC - clpRng.bd ) );
   const int shift1st = IF_FILTER_PREC - headRoom;
   const int shift2nd = IF_FILTER_PREC + headRoom;
@@ -204,7 +192,93 @@ static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const *src, const
   {
     offset2nd = 0;
   }
+  const int32x4_t voffset1 = vdupq_n_s32( offset1st );
+
+  const int16x4_t vibdimin = vdup_n_s16( clpRng.min() );
+  const int16x4_t vibdimax = vdup_n_s16( clpRng.max() );
+
+  int16x8_t ch = vld1q_s16( coeffH );
+  int16x8_t cv = vld1q_s16( coeffV );
+
+  int32x4_t invshift1st = vdupq_n_s32( -shift1st );
+  int32x4_t invshift2nd = vdupq_n_s32( -shift2nd );
+
+  int16x4_t vsrcv0 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x4_t vsrcv1 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x4_t vsrcv2 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x4_t vsrcv3 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x4_t vsrcv4 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x4_t vsrcv5 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x4_t vsrcv6 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+
+  do
+  {
+    int16x4_t vsrcv7 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
+    src += srcStride;
+
+    int32x4_t vsum0 = vdupq_n_s32( offset2nd );
+    vsum0 = vmlal_laneq_s16( vsum0, vsrcv0, cv, 0 );
+    vsum0 = vmlal_laneq_s16( vsum0, vsrcv1, cv, 1 );
+    vsum0 = vmlal_laneq_s16( vsum0, vsrcv2, cv, 2 );
+    vsum0 = vmlal_laneq_s16( vsum0, vsrcv3, cv, 3 );
+    vsum0 = vmlal_laneq_s16( vsum0, vsrcv4, cv, 4 );
+    vsum0 = vmlal_laneq_s16( vsum0, vsrcv5, cv, 5 );
+    vsum0 = vmlal_laneq_s16( vsum0, vsrcv6, cv, 6 );
+    vsum0 = vmlal_laneq_s16( vsum0, vsrcv7, cv, 7 );
+
+    int16x4_t vsum01;
+    if( isLast ) // clip
+    {
+      vsum01 = vqmovn_s32( vshlq_s32( vsum0, invshift2nd ) );
+      vsum01 = vmin_s16( vibdimax, vmax_s16( vibdimin, vsum01 ) );
+    }
+    else
+    {
+      vsum01 = vqshrn_n_s32( vsum0, IF_FILTER_PREC );
+    }
+
+    vsrcv0 = vsrcv1;
+    vsrcv1 = vsrcv2;
+    vsrcv2 = vsrcv3;
+    vsrcv3 = vsrcv4;
+    vsrcv4 = vsrcv5;
+    vsrcv5 = vsrcv6;
+    vsrcv6 = vsrcv7;
+
+    vst1_s16( dst, vsum01 );
+    dst += dstStride;
+  } while( --height != 0 );
+}
+
+template<bool isLast>
+static void simdFilter8xX_N8_neon( const ClpRng& clpRng, Pel const* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride,
+                                   int width, int height, TFilterCoeff const* coeffH, TFilterCoeff const* coeffV )
+{
+  OFFSET( src, srcStride, -3, -3 );
+  // With the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be
+  // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20.
+  const int headRoom = std::max( 2, ( IF_INTERNAL_PREC - clpRng.bd ) );
+  const int shift1st = IF_FILTER_PREC - headRoom;
+  const int shift2nd = IF_FILTER_PREC + headRoom;
+
+  const int offset1st = -IF_INTERNAL_OFFS * ( 1 << shift1st );
+  int offset2nd;
+  if( isLast )
+  {
+    offset2nd = ( 1 << ( shift2nd - 1 ) ) + ( IF_INTERNAL_OFFS << IF_FILTER_PREC );
+  }
+  else
+  {
+    offset2nd = 0;
+  }
   const int32x4_t voffset1 = vdupq_n_s32( offset1st );
 
   const int16x8_t vibdimin = vdupq_n_s16( clpRng.min() );
   const int16x8_t vibdimax = vdupq_n_s16( clpRng.max() );
 
   int16x8_t ch = vld1q_s16( coeffH );
   int16x8_t cv = vld1q_s16( coeffV );
 
   int32x4_t invshift1st = vdupq_n_s32( -shift1st );
   int32x4_t invshift2nd = vdupq_n_s32( -shift2nd );
@@ -216,24 +290,131 @@ static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const *src, const
-  int16x8x2_t vsrcv0 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
+  int16x8_t vsrcv0 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
   src += srcStride;
-  int16x8x2_t vsrcv1 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
+  int16x8_t vsrcv1 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
   src += srcStride;
-  int16x8x2_t vsrcv2 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
+  int16x8_t vsrcv2 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
   src += srcStride;
-  int16x8x2_t vsrcv3 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
+  int16x8_t vsrcv3 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
   src += srcStride;
-  int16x8x2_t vsrcv4 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
+  int16x8_t vsrcv4 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
   src += srcStride;
-  int16x8x2_t vsrcv5 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
+  int16x8_t vsrcv5 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
   src += srcStride;
-  int16x8x2_t vsrcv6 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
+  int16x8_t vsrcv6 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
   src += srcStride;
 
   do
   {
-    int16x8x2_t vsrcv7 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
+    int16x8_t vsrcv7 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
+    src += srcStride;
+
+    int32x4_t vsum0 = vdupq_n_s32( offset2nd );
+    int32x4_t vsum1 = vdupq_n_s32( offset2nd );
+
+    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv0 ), cv, 0 );
+    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv0 ), cv, 0 );
+
+    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv1 ), cv, 1 );
+    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv1 ), cv, 1 );
+
+    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv2 ), cv, 2 );
+    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv2 ), cv, 2 );
+
+    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv3 ), cv, 3 );
+    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv3 ), cv, 3 );
+
+    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv4 ), cv, 4 );
+    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv4 ), cv, 4 );
+
+    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv5 ), cv, 5 );
+    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv5 ), cv, 5 );
+
+    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv6 ), cv, 6 );
+    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv6 ), cv, 6 );
+
+    vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv7 ), cv, 7 );
+    vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv7 ), cv, 7 );
+
+    int16x8_t vsum01;
+    if( isLast ) // clip
+    {
+      vsum0 = vshlq_s32( vsum0, invshift2nd );
+      vsum1 = vshlq_s32( vsum1, invshift2nd );
+
+      vsum01 = vcombine_s16( vqmovn_s32( vsum0 ), vqmovn_s32( vsum1 ) );
+      vsum01 = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum01 ) );
+    }
+    else
+    {
+      vsum01 = vcombine_s16( vqshrn_n_s32( vsum0, IF_FILTER_PREC ), vqshrn_n_s32( vsum1, IF_FILTER_PREC ) );
+    }
+
+    vsrcv0 = vsrcv1;
+    vsrcv1 = vsrcv2;
+    vsrcv2 = vsrcv3;
+    vsrcv3 = vsrcv4;
+    vsrcv4 = vsrcv5;
+    vsrcv5 = vsrcv6;
+    vsrcv6 = vsrcv7;
+
+    vst1q_s16( dst, vsum01 );
+    dst += dstStride;
+  } while( --height != 0 );
+}
+
+template<bool isLast>
+static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride,
+                                    int width, int height, TFilterCoeff const* coeffH, TFilterCoeff const* coeffV )
+{
+  OFFSET( src, srcStride, -3, -3 );
+
+  // With the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be
+  // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20.
+  const int headRoom = std::max( 2, ( IF_INTERNAL_PREC - clpRng.bd ) );
+  const int shift1st = IF_FILTER_PREC - headRoom;
+  const int shift2nd = IF_FILTER_PREC + headRoom;
+
+  const int offset1st = -IF_INTERNAL_OFFS * ( 1 << shift1st );
+  int offset2nd;
+  if( isLast )
+  {
+    offset2nd = ( 1 << ( shift2nd - 1 ) ) + ( IF_INTERNAL_OFFS << IF_FILTER_PREC );
+  }
+  else
+  {
+    offset2nd = 0;
+  }
+  const int32x4_t voffset1 = vdupq_n_s32( offset1st );
+
+  const int16x8_t vibdimin = vdupq_n_s16( clpRng.min() );
+  const int16x8_t vibdimax = vdupq_n_s16( clpRng.max() );
+
+  int16x8_t ch = vld1q_s16( coeffH );
+  int16x8_t cv = vld1q_s16( coeffV );
+
+  int32x4_t invshift1st = vdupq_n_s32( -shift1st );
+  int32x4_t invshift2nd = vdupq_n_s32( -shift2nd );
+
+  int16x8x2_t vsrcv0 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x8x2_t vsrcv1 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x8x2_t vsrcv2 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x8x2_t vsrcv3 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x8x2_t vsrcv4 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x8x2_t vsrcv5 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+  int16x8x2_t vsrcv6 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
+  src += srcStride;
+
+  do
+  {
+    int16x8x2_t vsrcv7 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
     src += srcStride;
 
     int32x4_t vsum0 = vdupq_n_s32( offset2nd );
@@ -318,6 +499,12 @@ static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const *src, const
 template<>
 void InterpolationFilter::_initInterpolationFilterARM()
 {
+  m_filter4x4[ 0 ][ 0 ] = simdFilter4xX_N8_neon<false>;
+  m_filter4x4[ 0 ][ 1 ] = simdFilter4xX_N8_neon<true>;
+
+  m_filter8x8[ 0 ][ 0 ] = simdFilter8xX_N8_neon<false>;
+  m_filter8x8[ 0 ][ 1 ] = simdFilter8xX_N8_neon<true>;
+
   m_filter16x16[ 0 ][ 0 ] = simdFilter16xX_N8_neon<false>;
   m_filter16x16[ 0 ][ 1 ] = simdFilter16xX_N8_neon<true>;
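
For reference, all three kernels in this patch vectorise the same separable 8-tap motion-compensation filter. The sketch below is a rough scalar equivalent of that computation and is not part of the patch: the helper name scalarFilterWxX_N8_ref is hypothetical, Pel is assumed to be a 16-bit sample type, and the intermediate 16-bit saturation that the Neon code applies via vqmovn/vqshrn is ignored.

// Hypothetical scalar reference for the simdFilter{4,8,16}xX_N8 kernels (illustration only).
// Mirrors the shift/offset derivation and the optional final clipping visible above, but keeps
// intermediates in plain ints instead of saturated 16-bit lanes.
template<bool isLast>
static void scalarFilterWxX_N8_ref( const ClpRng& clpRng, const Pel* src, ptrdiff_t srcStride, Pel* dst, ptrdiff_t dstStride,
                                    int width, int height, const TFilterCoeff* coeffH, const TFilterCoeff* coeffV )
{
  src -= 3 * srcStride + 3;  // same origin adjustment as OFFSET( src, srcStride, -3, -3 )

  const int headRoom  = std::max( 2, IF_INTERNAL_PREC - clpRng.bd );
  const int shift1st  = IF_FILTER_PREC - headRoom;
  const int shift2nd  = IF_FILTER_PREC + headRoom;
  const int offset1st = -IF_INTERNAL_OFFS * ( 1 << shift1st );
  const int offset2nd = isLast ? ( 1 << ( shift2nd - 1 ) ) + ( IF_INTERNAL_OFFS << IF_FILTER_PREC ) : 0;

  for( int y = 0; y < height; y++ )
  {
    for( int x = 0; x < width; x++ )
    {
      int sum2 = offset2nd;
      for( int ty = 0; ty < 8; ty++ )  // vertical 8-tap pass over the horizontal intermediates
      {
        int sum1 = offset1st;
        for( int tx = 0; tx < 8; tx++ )  // horizontal 8-tap pass, as in filter{4,8,16}xX_N8_neon
        {
          sum1 += coeffH[ tx ] * src[ ( y + ty ) * srcStride + x + tx ];
        }
        sum2 += coeffV[ ty ] * ( sum1 >> shift1st );
      }
      int val = isLast ? ( sum2 >> shift2nd ) : ( sum2 >> IF_FILTER_PREC );
      if( isLast )  // clip to the valid sample range only on the last stage
      {
        val = std::min<int>( clpRng.max(), std::max<int>( clpRng.min(), val ) );
      }
      dst[ y * dstStride + x ] = ( Pel ) val;
    }
  }
}

The three Neon kernels differ from this sketch only in how many output columns they produce per row (4, 8, or 16 lanes) and in reusing the seven previously filtered rows across loop iterations instead of recomputing the horizontal pass for every output row.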