Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PR#444 from vvenc porting. #209

Closed
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
287 changes: 237 additions & 50 deletions source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,63 +133,51 @@ static void simdInterpolateN2_2D_neon( const ClpRng& clpRng, const Pel* src, con
}
}

static int16x8_t simdFilter16xX_N8_half( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
static int16x4_t filter4xX_N8_neon( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
{
int16x8_t vsrca00 = vld1q_s16( src + 0 );
int16x8_t vsrca01 = vld1q_s16( src + 1 );
int16x8_t vsrca10 = vld1q_s16( src + 2 );
int16x8_t vsrca11 = vld1q_s16( src + 3 );
int16x8_t vsrcb00 = vld1q_s16( src + 4 );
int16x8_t vsrcb01 = vld1q_s16( src + 5 );
int16x8_t vsrcb10 = vld1q_s16( src + 6 );
int16x8_t vsrcb11 = vld1q_s16( src + 7 );

int32x4_t a0 = vmull_s16( vget_low_s16( vsrca00 ), vget_low_s16( ch ) );
int32x4_t a1 = vmull_s16( vget_low_s16( vsrca01 ), vget_low_s16( ch ) );
int32x4_t a2 = vmull_s16( vget_low_s16( vsrca10 ), vget_low_s16( ch ) );
int32x4_t a3 = vmull_s16( vget_low_s16( vsrca11 ), vget_low_s16( ch ) );

int32x4_t b0 = vmull_s16( vget_low_s16( vsrcb00 ), vget_low_s16( ch ) );
int32x4_t b1 = vmull_s16( vget_low_s16( vsrcb01 ), vget_low_s16( ch ) );
int32x4_t b2 = vmull_s16( vget_low_s16( vsrcb10 ), vget_low_s16( ch ) );
int32x4_t b3 = vmull_s16( vget_low_s16( vsrcb11 ), vget_low_s16( ch ) );

a0 = vmlal_s16( a0, vget_high_s16( vsrca00 ), vget_high_s16( ch ) );
a1 = vmlal_s16( a1, vget_high_s16( vsrca01 ), vget_high_s16( ch ) );
a2 = vmlal_s16( a2, vget_high_s16( vsrca10 ), vget_high_s16( ch ) );
a3 = vmlal_s16( a3, vget_high_s16( vsrca11 ), vget_high_s16( ch ) );

b0 = vmlal_s16( b0, vget_high_s16( vsrcb00 ), vget_high_s16( ch ) );
b1 = vmlal_s16( b1, vget_high_s16( vsrcb01 ), vget_high_s16( ch ) );
b2 = vmlal_s16( b2, vget_high_s16( vsrcb10 ), vget_high_s16( ch ) );
b3 = vmlal_s16( b3, vget_high_s16( vsrcb11 ), vget_high_s16( ch ) );
int16x8_t vsrca0 = vld1q_s16( src + 0 );
int16x8_t vsrca1 = vld1q_s16( src + 1 );
int16x8_t vsrca2 = vld1q_s16( src + 2 );
int16x8_t vsrca3 = vld1q_s16( src + 3 );

int32x4_t vsuma = vpaddq_s32( vpaddq_s32( a0, a1 ), vpaddq_s32( a2, a3 ) );
int32x4_t vsumb = vpaddq_s32( vpaddq_s32( b0, b1 ), vpaddq_s32( b2, b3 ) );
int32x4_t a0 = vmull_s16( vget_low_s16( vsrca0 ), vget_low_s16( ch ) );
int32x4_t a1 = vmull_s16( vget_low_s16( vsrca1 ), vget_low_s16( ch ) );
int32x4_t a2 = vmull_s16( vget_low_s16( vsrca2 ), vget_low_s16( ch ) );
int32x4_t a3 = vmull_s16( vget_low_s16( vsrca3 ), vget_low_s16( ch ) );

vsuma = vaddq_s32( vsuma, voffset1 );
vsumb = vaddq_s32( vsumb, voffset1 );
a0 = vmlal_s16( a0, vget_high_s16( vsrca0 ), vget_high_s16( ch ) );
a1 = vmlal_s16( a1, vget_high_s16( vsrca1 ), vget_high_s16( ch ) );
a2 = vmlal_s16( a2, vget_high_s16( vsrca2 ), vget_high_s16( ch ) );
a3 = vmlal_s16( a3, vget_high_s16( vsrca3 ), vget_high_s16( ch ) );

vsuma = vshlq_s32( vsuma, invshift1st );
vsumb = vshlq_s32( vsumb, invshift1st );
int32x4_t vsuma = vpaddq_s32( vpaddq_s32( a0, a1 ), vpaddq_s32( a2, a3 ) );
vsuma = vaddq_s32( vsuma, voffset1 );
vsuma = vshlq_s32( vsuma, invshift1st );
return vqmovn_s32( vsuma );
}

return vcombine_s16( vqmovn_s32( vsuma ), vqmovn_s32( vsumb ) );
static int16x8_t filter8xX_N8_neon( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
{
int16x4_t lo = filter4xX_N8_neon( src + 0, ch, voffset1, invshift1st );
int16x4_t hi = filter4xX_N8_neon( src + 4, ch, voffset1, invshift1st );
return vcombine_s16( lo, hi );
}

static int16x8x2_t simdFilter16xX_N8_step( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
static int16x8x2_t filter16xX_N8_neon( Pel const* src, int16x8_t ch, int32x4_t voffset1, int32x4_t invshift1st )
{
int16x8_t a = simdFilter16xX_N8_half( src + 0, ch, voffset1, invshift1st );
int16x8_t b = simdFilter16xX_N8_half( src + 8, ch, voffset1, invshift1st );
int16x8_t a = filter8xX_N8_neon( src + 0, ch, voffset1, invshift1st );
int16x8_t b = filter8xX_N8_neon( src + 8, ch, voffset1, invshift1st );
return ( int16x8x2_t ){ a, b };
}

template<bool isLast>
static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const *src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride, int width, int height, TFilterCoeff const *coeffH, TFilterCoeff const *coeffV )
static void simdFilter4xX_N8_neon( const ClpRng& clpRng, Pel const* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride,
int width, int height, TFilterCoeff const* coeffH, TFilterCoeff const* coeffV )
{
OFFSET( src, srcStride, -3, -3 );

// with the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be
// negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20
// With the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be
// negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20.
const int headRoom = std::max<int>( 2, ( IF_INTERNAL_PREC - clpRng.bd ) );
const int shift1st = IF_FILTER_PREC - headRoom;
const int shift2nd = IF_FILTER_PREC + headRoom;
Expand All @@ -204,7 +192,93 @@ static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const *src, const
{
offset2nd = 0;
}
const int32x4_t voffset1 = vdupq_n_s32( offset1st );

const int16x4_t vibdimin = vdup_n_s16( clpRng.min() );
const int16x4_t vibdimax = vdup_n_s16( clpRng.max() );

int16x8_t ch = vld1q_s16( coeffH );
int16x8_t cv = vld1q_s16( coeffV );

int32x4_t invshift1st = vdupq_n_s32( -shift1st );
int32x4_t invshift2nd = vdupq_n_s32( -shift2nd );

int16x4_t vsrcv0 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x4_t vsrcv1 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x4_t vsrcv2 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x4_t vsrcv3 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x4_t vsrcv4 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x4_t vsrcv5 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x4_t vsrcv6 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;

do
{
int16x4_t vsrcv7 = filter4xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;

int32x4_t vsum0 = vdupq_n_s32( offset2nd );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv0, cv, 0 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv1, cv, 1 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv2, cv, 2 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv3, cv, 3 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv4, cv, 4 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv5, cv, 5 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv6, cv, 6 );
vsum0 = vmlal_laneq_s16( vsum0, vsrcv7, cv, 7 );

int16x4_t vsum01;
if( isLast ) // clip
{
vsum01 = vqmovn_s32( vshlq_s32( vsum0, invshift2nd ) );
vsum01 = vmin_s16( vibdimax, vmax_s16( vibdimin, vsum01 ) );
}
else
{
vsum01 = vqshrn_n_s32( vsum0, IF_FILTER_PREC );
}

vsrcv0 = vsrcv1;
vsrcv1 = vsrcv2;
vsrcv2 = vsrcv3;
vsrcv3 = vsrcv4;
vsrcv4 = vsrcv5;
vsrcv5 = vsrcv6;
vsrcv6 = vsrcv7;

vst1_s16( dst, vsum01 );
dst += dstStride;
} while( --height != 0 );
}

template<bool isLast>
static void simdFilter8xX_N8_neon( const ClpRng& clpRng, Pel const* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride,
int width, int height, TFilterCoeff const* coeffH, TFilterCoeff const* coeffV )
{
OFFSET( src, srcStride, -3, -3 );

// With the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be
// negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20.
const int headRoom = std::max<int>( 2, ( IF_INTERNAL_PREC - clpRng.bd ) );
const int shift1st = IF_FILTER_PREC - headRoom;
const int shift2nd = IF_FILTER_PREC + headRoom;

const int offset1st = -IF_INTERNAL_OFFS * ( 1 << shift1st );
int offset2nd;
if( isLast )
{
offset2nd = ( 1 << ( shift2nd - 1 ) ) + ( IF_INTERNAL_OFFS << IF_FILTER_PREC );
}
else
{
offset2nd = 0;
}
const int32x4_t voffset1 = vdupq_n_s32( offset1st );

const int16x8_t vibdimin = vdupq_n_s16( clpRng.min() );
Expand All @@ -216,24 +290,131 @@ static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const *src, const
int32x4_t invshift1st = vdupq_n_s32( -shift1st );
int32x4_t invshift2nd = vdupq_n_s32( -shift2nd );

int16x8x2_t vsrcv0 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
int16x8_t vsrcv0 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv1 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
int16x8_t vsrcv1 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv2 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
int16x8_t vsrcv2 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv3 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
int16x8_t vsrcv3 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv4 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
int16x8_t vsrcv4 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv5 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
int16x8_t vsrcv5 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv6 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
int16x8_t vsrcv6 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;

do
{
int16x8x2_t vsrcv7 = simdFilter16xX_N8_step( src, ch, voffset1, invshift1st );
int16x8_t vsrcv7 = filter8xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;

int32x4_t vsum0 = vdupq_n_s32( offset2nd );
int32x4_t vsum1 = vdupq_n_s32( offset2nd );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv0 ), cv, 0 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv0 ), cv, 0 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv1 ), cv, 1 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv1 ), cv, 1 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv2 ), cv, 2 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv2 ), cv, 2 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv3 ), cv, 3 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv3 ), cv, 3 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv4 ), cv, 4 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv4 ), cv, 4 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv5 ), cv, 5 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv5 ), cv, 5 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv6 ), cv, 6 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv6 ), cv, 6 );

vsum0 = vmlal_laneq_s16( vsum0, vget_low_s16( vsrcv7 ), cv, 7 );
vsum1 = vmlal_laneq_s16( vsum1, vget_high_s16( vsrcv7 ), cv, 7 );

int16x8_t vsum01;
if( isLast ) // clip
{
vsum0 = vshlq_s32( vsum0, invshift2nd );
vsum1 = vshlq_s32( vsum1, invshift2nd );

vsum01 = vcombine_s16( vqmovn_s32( vsum0 ), vqmovn_s32( vsum1 ) );
vsum01 = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum01 ) );
}
else
{
vsum01 = vcombine_s16( vqshrn_n_s32( vsum0, IF_FILTER_PREC ), vqshrn_n_s32( vsum1, IF_FILTER_PREC ) );
}

vsrcv0 = vsrcv1;
vsrcv1 = vsrcv2;
vsrcv2 = vsrcv3;
vsrcv3 = vsrcv4;
vsrcv4 = vsrcv5;
vsrcv5 = vsrcv6;
vsrcv6 = vsrcv7;

vst1q_s16( dst, vsum01 );
dst += dstStride;
} while( --height != 0 );
}

template<bool isLast>
static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const* src, const ptrdiff_t srcStride, Pel* dst, const ptrdiff_t dstStride,
int width, int height, TFilterCoeff const* coeffH, TFilterCoeff const* coeffV )
{
OFFSET( src, srcStride, -3, -3 );

// With the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be
// negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20.
const int headRoom = std::max<int>( 2, ( IF_INTERNAL_PREC - clpRng.bd ) );
const int shift1st = IF_FILTER_PREC - headRoom;
const int shift2nd = IF_FILTER_PREC + headRoom;

const int offset1st = -IF_INTERNAL_OFFS * ( 1 << shift1st );
int offset2nd;
if( isLast )
{
offset2nd = ( 1 << ( shift2nd - 1 ) ) + ( IF_INTERNAL_OFFS << IF_FILTER_PREC );
}
else
{
offset2nd = 0;
}
const int32x4_t voffset1 = vdupq_n_s32( offset1st );

const int16x8_t vibdimin = vdupq_n_s16( clpRng.min() );
const int16x8_t vibdimax = vdupq_n_s16( clpRng.max() );

int16x8_t ch = vld1q_s16( coeffH );
int16x8_t cv = vld1q_s16( coeffV );

int32x4_t invshift1st = vdupq_n_s32( -shift1st );
int32x4_t invshift2nd = vdupq_n_s32( -shift2nd );

int16x8x2_t vsrcv0 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv1 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv2 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv3 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv4 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv5 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;
int16x8x2_t vsrcv6 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;

do
{
int16x8x2_t vsrcv7 = filter16xX_N8_neon( src, ch, voffset1, invshift1st );
src += srcStride;

int32x4_t vsum0 = vdupq_n_s32( offset2nd );
Expand Down Expand Up @@ -318,6 +499,12 @@ static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const *src, const
template<>
void InterpolationFilter::_initInterpolationFilterARM<NEON>()
{
m_filter4x4[ 0 ][ 0 ] = simdFilter4xX_N8_neon<false>;
m_filter4x4[ 0 ][ 1 ] = simdFilter4xX_N8_neon<true>;

m_filter8x8[ 0 ][ 0 ] = simdFilter8xX_N8_neon<false>;
m_filter8x8[ 0 ][ 1 ] = simdFilter8xX_N8_neon<true>;

m_filter16x16[ 0 ][ 0 ] = simdFilter16xX_N8_neon<false>;
m_filter16x16[ 0 ][ 1 ] = simdFilter16xX_N8_neon<true>;

Expand Down