From 19a7a1a2f0eee27aa6fac93f1fe36ad89c1b59ef Mon Sep 17 00:00:00 2001
From: nihuini
Date: Mon, 27 Nov 2023 17:11:26 +0800
Subject: [PATCH] fix build with msvc arm64 asimdhp

---
 CMakeLists.txt                                 |  58 ++---
 src/layer/arm/arm_activation.h                 |  28 ++-
 src/layer/arm/arm_usability.h                  | 109 ++++++++
 src/layer/arm/batchnorm_arm_asimdhp.cpp        |  16 +-
 src/layer/arm/binaryop_arm_asimdhp.cpp         |   1 +
 src/layer/arm/clip_arm_asimdhp.cpp             |   1 +
 src/layer/arm/convolution1d_arm.cpp            |   2 +-
 src/layer/arm/convolution1d_packed_fp16s.h     |  10 +-
 .../arm/convolution_3x3_winograd_fp16s.h       | 232 +++++++++---------
 src/layer/arm/convolution_arm_asimdhp.cpp      |  14 ++
 src/layer/arm/convolution_packed_fp16s.h       |  10 +-
 .../arm/convolutiondepthwise_arm_asimdhp.cpp   |   6 +-
 src/layer/arm/deconvolution_arm_asimdhp.cpp    |  18 +-
 .../deconvolutiondepthwise_arm_asimdhp.cpp     |   6 +-
 src/layer/arm/dequantize_arm_asimdhp.cpp       |   1 +
 src/layer/arm/eltwise_arm_asimdhp.cpp          |   1 +
 src/layer/arm/gelu_arm_asimdhp.cpp             |   5 +-
 src/layer/arm/gemm_arm_asimdhp.cpp             |   2 +-
 src/layer/arm/hardsigmoid_arm_asimdhp.cpp      |   1 +
 src/layer/arm/hardswish_arm_asimdhp.cpp        |   1 +
 src/layer/arm/innerproduct_arm_asimdhp.cpp     |  64 ++---
 src/layer/arm/instancenorm_arm_asimdhp.cpp     |  14 +-
 src/layer/arm/interp_arm_asimdhp.cpp           |   1 +
 src/layer/arm/interp_bicubic_fp16s.h           |   6 +-
 src/layer/arm/mish_arm_asimdhp.cpp             |   9 +-
 src/layer/arm/neon_mathfun_fp16s.h             |  89 +++++--
 src/layer/arm/pooling_arm_asimdhp.cpp          |  23 +-
 src/layer/arm/prelu_arm_asimdhp.cpp            |   9 +
 src/layer/arm/relu_arm_asimdhp.cpp             |   1 +
 src/layer/arm/sigmoid_arm_asimdhp.cpp          |  17 +-
 src/layer/arm/softmax_arm_asimdhp.cpp          |  57 ++---
 src/layer/arm/swish_arm_asimdhp.cpp            |  11 +-
 src/layer/arm/tanh_arm_asimdhp.cpp             |   7 +-
 src/layer/arm/unaryop_arm_asimdhp.cpp          |  27 +-
 34 files changed, 548 insertions(+), 309 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4404b5a9024..86ca5f6e6e2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -183,35 +183,35 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm")
         set(CMAKE_REQUIRED_FLAGS "/arch:armv8.0")
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)
 
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
-        # check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16x8_t _s, _a, _b; _s = vfmaq_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
-        # check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
-        # check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
-        # check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
-        # check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-        # check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat16_t _s, _a, _b; svbool_t bp; _s = svmla_f16_z(bp, _s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-        # check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint16_t _s; svint8_t _a, _b; _s = svmlslb_s16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-        # check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s; svbfloat16_t _a, _b; _s = svbfmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-        # check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint32_t _s; svint8_t _a, _b; _s = svmmla_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-        # check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s, _a, _b; _s = svmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16x8_t _s, _a, _b; _s = vfmaq_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+        check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat16_t _s, _a, _b; svbool_t bp; _s = svmla_f16_z(bp, _s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+        check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint16_t _s; svint8_t _a, _b; _s = svmlslb_s16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+        check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s; svbfloat16_t _a, _b; _s = svbfmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+        check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint32_t _s; svint8_t _a, _b; _s = svmmla_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+        check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s, _a, _b; _s = svmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
 
         unset(CMAKE_REQUIRED_FLAGS)
     else()
diff --git a/src/layer/arm/arm_activation.h b/src/layer/arm/arm_activation.h
index d1407e8f165..aca3e57479f 100644
--- a/src/layer/arm/arm_activation.h
+++ b/src/layer/arm/arm_activation.h
@@ -68,9 +68,10 @@ static inline float32x4_t activation_ps(float32x4_t _v, int activation_type, con
 }
 
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#include "arm_usability.h"
 #include "neon_mathfun_fp16s.h"
 
-static inline __fp16 activation_ss(__fp16 v, int activation_type, const ncnn::Mat& activation_params)
+static inline __fp16 activation_ss_f16(__fp16 v, int activation_type, const ncnn::Mat& activation_params)
 {
     if (activation_type == 1)
     {
@@ -92,11 +93,11 @@ static inline __fp16 activation_ss(__fp16 v, int activation_type, const ncnn::Ma
     }
     else if (activation_type == 4)
     {
-        v = (__fp16)1.f / ((__fp16)1.f + expf(-v));
+        v = (__fp16)1.f / ((__fp16)1.f + (__fp16)expf(-v));
     }
     else if (activation_type == 5)
     {
-        v = v * tanhf(logf(expf(v) + (__fp16)1.f));
+        v = v * (__fp16)tanhf(logf(expf((float)v) + 1.f));
     }
     else if (activation_type == 6)
     {
@@ -115,7 +116,7 @@ static inline __fp16 activation_ss(__fp16 v, int activation_type, const ncnn::Ma
     return v;
 }
 
-static inline float16x4_t activation_ps(float16x4_t _v, int activation_type, const ncnn::Mat& activation_params)
+static inline float16x4_t activation_ps_f16(float16x4_t _v, int activation_type, const ncnn::Mat& activation_params)
 {
     if (activation_type == 1)
     {
@@ -125,7 +126,11 @@ static inline float16x4_t activation_ps(float16x4_t _v, int activation_type, con
     else if (activation_type == 2)
     {
         const float16x4_t _zero = vdup_n_f16(0.f);
+#if _MSC_VER
+        const float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(activation_params[0]));
+#else
         const float16x4_t _slope = vdup_n_f16((__fp16)activation_params[0]);
+#endif
         const uint16x4_t _lemask = vcle_f16(_v, _zero);
         float16x4_t _ps = vmul_f16(_v, _slope);
         _v = vbsl_f16(_lemask, _ps, _v);
@@ -139,11 +144,11 @@ static inline float16x4_t activation_ps(float16x4_t _v, int activation_type, con
     }
     else if (activation_type == 4)
     {
-        _v = sigmoid_ps(_v);
+        _v = sigmoid_ps_f16(_v);
     }
     else if (activation_type == 5)
     {
-        _v = vmul_f16(_v, tanh_ps(log_ps(vadd_f16(exp_ps(_v), vdup_n_f16(1.f)))));
+        _v = vmul_f16(_v, tanh_ps_f16(log_ps_f16(vadd_f16(exp_ps_f16(_v), vdup_n_f16(1.f)))));
     }
     else if (activation_type == 6)
     {
@@ -161,7 +166,7 @@ static inline float16x4_t activation_ps(float16x4_t _v, int activation_type, con
     return _v;
 }
 
-static inline float16x8_t activation_ps(float16x8_t _v, int activation_type, const ncnn::Mat& activation_params)
+static inline float16x8_t activation_ps_f16(float16x8_t _v, int activation_type, const ncnn::Mat& activation_params)
 {
     if (activation_type == 1)
     {
@@ -171,7 +176,12 @@ static inline float16x8_t activation_ps(float16x8_t _v, int activation_type, con
     else if (activation_type == 2)
     {
         const float16x8_t _zero = vdupq_n_f16(0.f);
+#if _MSC_VER
+        const float16x4_t _slope0 = vcvt_f16_f32(vdupq_n_f32(activation_params[0]));
+        const float16x8_t _slope = vcombine_f16(_slope0, _slope0);
+#else
         const float16x8_t _slope = vdupq_n_f16((__fp16)activation_params[0]);
+#endif
         const uint16x8_t _lemask = vcleq_f16(_v, _zero);
         float16x8_t _ps = vmulq_f16(_v, _slope);
         _v = vbslq_f16(_lemask, _ps, _v);
@@ -185,11 +195,11 @@ static inline float16x8_t activation_ps(float16x8_t _v, int activation_type, con
     }
     else if (activation_type == 4)
     {
-        _v = sigmoid_ps(_v);
+        _v = sigmoid_ps_f16(_v);
     }
     else if (activation_type == 5)
     {
-        _v = vmulq_f16(_v, tanh_ps(log_ps(vaddq_f16(exp_ps(_v), vdupq_n_f16(1.f)))));
+        _v = vmulq_f16(_v, tanh_ps_f16(log_ps_f16(vaddq_f16(exp_ps_f16(_v), vdupq_n_f16(1.f)))));
     }
     else if (activation_type == 6)
     {
diff --git a/src/layer/arm/arm_usability.h b/src/layer/arm/arm_usability.h
index eb042d8ef34..4cd029caeed 100644
--- a/src/layer/arm/arm_usability.h
+++ b/src/layer/arm/arm_usability.h
@@ -123,6 +123,115 @@ static inline int8x8_t float2int8leakyrelu(float32x4_t _vlow, float32x4_t _vhigh
 }
 
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef _MSC_VER
+struct __fp16
+{
+    __fp16()
+    {
+        _u16 = 0;
+    }
+
+    __fp16(float f32)
+    {
+        _u16 = vget_lane_u16(vreinterpret_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
+    }
+
+    __fp16(__n16 n16)
+    {
+        _u16 = n16.n16_u16[0];
+    }
+
+    operator const float() const
+    {
+        return vgetq_lane_f32(vcvt_f32_f16(vreinterpret_f16_u16(vdup_n_u16(_u16))), 0);
+    }
+
+    __fp16& operator+=(const __fp16& b)
+    {
+        float a = (float)*this;
+        float f32 = (a + (float)b);
+        _u16 = vget_lane_u16(vreinterpret_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
+        return *this;
+    }
+
+    __fp16& operator-=(const __fp16& b)
+    {
+        float a = (float)*this;
+        float f32 = (a - (float)b);
+        _u16 = vget_lane_u16(vreinterpret_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
+        return *this;
+    }
+
+    __fp16& operator*=(const __fp16& b)
+    {
+        float a = (float)*this;
+        float f32 = (a * (float)b);
+        _u16 = vget_lane_u16(vreinterpret_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
+        return *this;
+    }
+
+    __fp16& operator/=(const __fp16& b)
+    {
+        float a = (float)*this;
+        float f32 = (a / (float)b);
+        _u16 = vget_lane_u16(vreinterpret_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
+        return *this;
+    }
+
+    unsigned short _u16;
+};
+
+static inline __fp16 operator-(const __fp16& a)
+{
+    return __fp16(-(float)a);
+}
+static inline __fp16 operator+(const __fp16& a, const __fp16& b)
+{
+    return __fp16((float)a + (float)b);
+}
+static inline __fp16 operator-(const __fp16& a, const __fp16& b)
+{
+    return __fp16((float)a - (float)b);
+}
+static inline __fp16 operator*(const __fp16& a, const __fp16& b)
+{
+    return __fp16((float)a * (float)b);
+}
+static inline __fp16 operator/(const __fp16& a, const __fp16& b)
+{
+    return __fp16((float)a / (float)b);
+}
+
+static inline float16x4_t vdup_n_f16(const __fp16& f16)
+{
+    return vreinterpret_f16_u16(vdup_n_u16(f16._u16));
+}
+
+static inline float16x8_t vdupq_n_f16(const __fp16& f16)
+{
+    return vreinterpretq_f16_u16(vdupq_n_u16(f16._u16));
+}
+
+static inline __fp16 vmaxv_f16(float16x4_t a)
+{
+    return __fp16(vmaxvq_f32(vcvt_f32_f16(a)));
+}
+
+static inline __fp16 vmaxvq_f16(float16x8_t a)
+{
+    return __fp16(fmaxf(vmaxvq_f32(vcvt_f32_f16(vget_low_f16(a))), vmaxvq_f32(vcvt_f32_f16(vget_high_f16(a)))));
+}
+
+#define vld1q_f16 vld1q_u16
+#define vst1q_f16 vst1q_u16
+
+#define vld2_f16 vld2_u16
+#define vst2_f16 vst2_u16
+
+#define vld2q_f16 vld2q_u16
+#define vst2q_f16 vst2q_u16
+
+#define vld4_f16 vld4_u16
+#define vst4_f16 vst4_u16
+
+#define vld4q_f16 vld4q_u16
+#define vst4q_f16 vst4q_u16
+
+#define vld1q_dup_f16 vld1q_dup_u16
+
+#define vset_lane_f16(x, v, i)  vset_lane_u16(x._u16, (uint16x4_t)v, i)
+#define vsetq_lane_f16(x, v, i) vsetq_lane_u16(x._u16, (uint16x8_t)v, i)
+
+#define vfma_n_f16(va, vb, x)  vfma_f16(va, vb, vdup_n_f16(x))
+#define vfmaq_n_f16(va, vb, x) vfmaq_f16(va, vb, vdupq_n_f16(x))
+
+#endif
+
 static inline signed char float2int8(__fp16 v)
 {
     int int32 = round(v);
diff --git a/src/layer/arm/batchnorm_arm_asimdhp.cpp b/src/layer/arm/batchnorm_arm_asimdhp.cpp
index 2fce9b706d5..3bfc1dbafec 100644
--- a/src/layer/arm/batchnorm_arm_asimdhp.cpp
+++ b/src/layer/arm/batchnorm_arm_asimdhp.cpp
@@ -18,6 +18,8 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
+#include "arm_usability.h"
+
 namespace ncnn {
 
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -109,7 +111,7 @@ int BatchNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt
         #pragma omp parallel for num_threads(opt.num_threads)
         for (int i = 0; i < w; i++)
         {
-            ptr[i] = b_data[i] * ptr[i] + a_data[i];
+            ptr[i] = b_data[i] * (float)ptr[i] + a_data[i];
         }
     }
 
@@ -140,7 +142,7 @@ int BatchNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt
             }
             for (; j < w; j++)
            {
-                *ptr = b * (float)*ptr + a;
+                *ptr = b * (float)*ptr + a;
                 ptr++;
             }
 
@@ -177,7 +179,7 @@ int BatchNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt
             }
             for (; j < size; j++)
            {
-                *ptr = b * *ptr + a;
+                *ptr = b * (float)*ptr + a;
                 ptr++;
             }
 
@@ -367,7 +369,11 @@ int BatchNorm_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
             __fp16 b = (__fp16)b_data[i];
 
             float16x4_t _a = vdup_n_f16(a);
+#if _MSC_VER
+            float16x4_t _b = vcvt_f16_f32(vdupq_n_f32(b_data[i]));
+#else
             float16x4_t _b = vdup_n_f16(b);
+#endif
 
             int j = 0;
             for (; j + 3 < w; j += 4)
@@ -404,7 +410,11 @@ int BatchNorm_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
             __fp16 b = (__fp16)b_data[q];
 
             float16x4_t _a = vdup_n_f16(a);
+#if _MSC_VER
+            float16x4_t _b = vcvt_f16_f32(vdupq_n_f32(b_data[q]));
+#else
             float16x4_t _b = vdup_n_f16(b);
+#endif
 
             int j = 0;
             for (; j + 3 < size; j += 4)
diff --git a/src/layer/arm/binaryop_arm_asimdhp.cpp b/src/layer/arm/binaryop_arm_asimdhp.cpp
index b9a8ea2d00b..8c49cadb88c 100644
--- a/src/layer/arm/binaryop_arm_asimdhp.cpp
+++ b/src/layer/arm/binaryop_arm_asimdhp.cpp
@@ -17,6 +17,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {
diff --git a/src/layer/arm/clip_arm_asimdhp.cpp b/src/layer/arm/clip_arm_asimdhp.cpp
index 3e4dc7bcd64..55040c3b6bd 100644
--- a/src/layer/arm/clip_arm_asimdhp.cpp
+++ b/src/layer/arm/clip_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #ifdef __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {
diff --git a/src/layer/arm/convolution1d_arm.cpp b/src/layer/arm/convolution1d_arm.cpp
index ab480aec62a..48368fb9cc6 100644
--- a/src/layer/arm/convolution1d_arm.cpp
+++ b/src/layer/arm/convolution1d_arm.cpp
@@ -18,8 +18,8 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
-#include "arm_activation.h"
 #include "arm_usability.h"
+#include "arm_activation.h"
 #include "cpu.h"
 #include "layer_type.h"
 
diff --git a/src/layer/arm/convolution1d_packed_fp16s.h b/src/layer/arm/convolution1d_packed_fp16s.h
index e55c45f573d..7b0c3e4975c 100644
--- a/src/layer/arm/convolution1d_packed_fp16s.h
+++ b/src/layer/arm/convolution1d_packed_fp16s.h
@@ -1378,7 +1378,7 @@ static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
             _sum2 = vaddq_f16(_sum2, _sum3);
             _sum0 = vaddq_f16(_sum0, _sum2);
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
 
             if (out_elempack == 8)
             {
@@ -1571,7 +1571,7 @@ static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
             _sum2 = vadd_f16(_sum2, _sum3);
             _sum0 = vadd_f16(_sum0, _sum2);
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
 
             if (out_elempack == 4)
             {
@@ -1737,8 +1737,8 @@ static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
                 sum0 += vget_lane_f16(_ss, 0);
                 sum1 += vget_lane_f16(_ss, 1);
 
-            sum0 = activation_ss(sum0, activation_type, activation_params);
-            sum1 = activation_ss(sum1, activation_type, activation_params);
+            sum0 = activation_ss_f16(sum0, activation_type, activation_params);
+            sum1 = activation_ss_f16(sum1, activation_type, activation_params);
 
             outptr0[0] = sum0;
             outptr1[0] = sum1;
@@ -1874,7 +1874,7 @@ static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
                 _ss = vpadd_f16(_ss, _ss);
                 sum += vget_lane_f16(_ss, 0);
 
-            sum = activation_ss(sum, activation_type, activation_params);
+            sum = activation_ss_f16(sum, activation_type, activation_params);
 
             outptr[0] = sum;
             outptr += 1;
diff --git a/src/layer/arm/convolution_3x3_winograd_fp16s.h b/src/layer/arm/convolution_3x3_winograd_fp16s.h
index 813b81299dd..7332d61a798 100644
--- a/src/layer/arm/convolution_3x3_winograd_fp16s.h
+++ b/src/layer/arm/convolution_3x3_winograd_fp16s.h
@@ -3562,15 +3562,15 @@ static inline void conv3x3s1_winograd43_transform_input_tile_fp16sa(const Mat& b
             __fp16 tmp12a0 = sq2 * r10 + msq2_d2 * r30;
             __fp16 tmp12a1 = sq2 * r11 + msq2_d2 * r31;
-            __fp16 tmp12b0 = r40 - 2 * r20;
-            __fp16 tmp12b1 = r41 - 2 * r21;
+            __fp16 tmp12b0 = r40 - (__fp16)2.f * r20;
+            __fp16 tmp12b1 = r41 - (__fp16)2.f * r21;
             __fp16 tmp34a0 = sq2 * r30 + msq2_d2 * r10;
             __fp16 tmp34a1 = sq2 * r31 + msq2_d2 * r11;
-            __fp16 tmp34b0 = r40 - 0.5f * r20;
-            __fp16 tmp34b1 = r41 - 0.5f * r21;
+            __fp16 tmp34b0 = r40 - (__fp16)0.5f * r20;
+            __fp16 tmp34b1 = r41 - (__fp16)0.5f * r21;
 
-            tmp[0][m][0] = r00 + r40 - 2.5f * r20;
-            tmp[0][m][1] = r01 + r41 - 2.5f * r21;
+            tmp[0][m][0] = r00 + r40 - (__fp16)2.5f * r20;
+            tmp[0][m][1] = r01 + r41 - (__fp16)2.5f * r21;
             tmp[1][m][0] = tmp12b0 - tmp12a0;
             tmp[1][m][1] = tmp12b1 - tmp12a1;
             tmp[2][m][0] = tmp12b0 + tmp12a0;
@@ -3579,8 +3579,8 @@ static inline void conv3x3s1_winograd43_transform_input_tile_fp16sa(const Mat& b
             tmp[3][m][1] = tmp34b1 + tmp34a1;
             tmp[4][m][0] = tmp34b0 - tmp34a0;
             tmp[4][m][1] = tmp34b1 - tmp34a1;
-            tmp[5][m][0] = r10 + r50 - 2.5f * r30;
-            tmp[5][m][1] = r11 + r51 - 2.5f * r31;
+            tmp[5][m][0] = r10 + r50 - (__fp16)2.5f * r30;
+            tmp[5][m][1] = r11 + r51 - (__fp16)2.5f * r31;
 
             r0 += w;
         }
@@ -3609,15 +3609,15 @@ static inline void conv3x3s1_winograd43_transform_input_tile_fp16sa(const Mat& b
             __fp16 tmp12a0 = sq2 * r10 + msq2_d2 * r30;
             __fp16 tmp12a1 = sq2 * r11 + msq2_d2 * r31;
-            __fp16 tmp12b0 = r40 - 2 * r20;
-            __fp16 tmp12b1 = r41 - 2 * r21;
+            __fp16 tmp12b0 = r40 - (__fp16)2.f * r20;
+            __fp16 tmp12b1 = r41 - (__fp16)2.f * r21;
             __fp16 tmp34a0 = sq2 * r30 + msq2_d2 * r10;
             __fp16 tmp34a1 = sq2 * r31 + msq2_d2 * r11;
-            __fp16 tmp34b0 = r40 - 0.5f * r20;
-            __fp16 tmp34b1 = r41 - 0.5f * r21;
+            __fp16 tmp34b0 = r40 - (__fp16)0.5f * r20;
+            __fp16 tmp34b1 = r41 - (__fp16)0.5f * r21;
 
-            p0[0] = r00 + r40 - 2.5f * r20;
-            p0[1] = r01 + r41 - 2.5f * r21;
+            p0[0] = r00 + r40 - (__fp16)2.5f * r20;
+            p0[1] = r01 + r41 - (__fp16)2.5f * r21;
             p1[0] = tmp12b0 - tmp12a0;
             p1[1] = tmp12b1 - tmp12a1;
             p2[0] = tmp12b0 + tmp12a0;
@@ -3626,8 +3626,8 @@ static inline void conv3x3s1_winograd43_transform_input_tile_fp16sa(const Mat& b
             p3[1] = tmp34b1 + tmp34a1;
             p4[0] = tmp34b0 - tmp34a0;
             p4[1] = tmp34b1 - tmp34a1;
-            p5[0] = r10 + r50 - 2.5f * r30;
-            p5[1] = r11 + r51 - 2.5f * r31;
+            p5[0] = r10 + r50 - (__fp16)2.5f * r30;
+            p5[1] = r11 + r51 - (__fp16)2.5f * r31;
 
             p0 += max_jj * 6 * 2;
             p1 += max_jj * 6 * 2;
@@ -3674,16 +3674,16 @@ static inline void conv3x3s1_winograd43_transform_input_tile_fp16sa(const Mat& b
             }
 
             __fp16 tmp12a = sq2 * r1 + msq2_d2 * r3;
-            __fp16 tmp12b = r4 - 2 * r2;
+            __fp16 tmp12b = r4 - (__fp16)2.f * r2;
             __fp16 tmp34a = sq2 * r3 + msq2_d2 * r1;
-            __fp16 tmp34b = r4 - 0.5f * r2;
+            __fp16 tmp34b = r4 - (__fp16)0.5f * r2;
 
-            tmp[0][m] = r0 + r4 - 2.5f * r2;
+            tmp[0][m] = r0 + r4 - (__fp16)2.5f * r2;
             tmp[1][m] = tmp12b - tmp12a;
             tmp[2][m] = tmp12b + tmp12a;
             tmp[3][m] = tmp34b + tmp34a;
             tmp[4][m] = tmp34b - tmp34a;
-            tmp[5][m] = r1 + r5 - 2.5f * r3;
+            tmp[5][m] = r1 + r5 - (__fp16)2.5f * r3;
 
             r0123 += w;
         }
@@ -3705,16 +3705,16 @@ static inline void conv3x3s1_winograd43_transform_input_tile_fp16sa(const Mat& b
             __fp16 r5 = tmp[m][5];
 
             __fp16 tmp12a = sq2 * r1 + msq2_d2 * r3;
-            __fp16 tmp12b = r4 - 2 * r2;
+            __fp16 tmp12b = r4 - (__fp16)2.f * r2;
             __fp16 tmp34a = sq2 * r3 + msq2_d2 * r1;
-            __fp16 tmp34b = r4 - 0.5f * r2;
+            __fp16 tmp34b = r4 - (__fp16)0.5f * r2;
 
-            p0[0] = r0 + r4 - 2.5f * r2;
+            p0[0] = r0 + r4 - (__fp16)2.5f * r2;
             p1[0] = tmp12b - tmp12a;
             p2[0] = tmp12b + tmp12a;
             p3[0] = tmp34b + tmp34a;
             p4[0] = tmp34b - tmp34a;
-            p5[0] = r1 + r5 - 2.5f * r3;
+            p5[0] = r1 + r5 - (__fp16)2.5f * r3;
 
             p0 += max_jj * 6;
             p1 += max_jj * 6;
@@ -4107,8 +4107,8 @@ static inline void conv3x3s1_winograd43_transform_output_tile_fp16sa(const Mat&
             tmp[0][m][1] = r0[1] + tmp02a1 + tmp02b1;
             tmp[1][m][0] = tmp13a0 * sq2_d2 + tmp13b0 * sq2;
             tmp[1][m][1] = tmp13a1 * sq2_d2 + tmp13b1 * sq2;
-            tmp[2][m][0] = tmp02a0 * 0.5f + tmp02b0 * 2;
-            tmp[2][m][1] = tmp02a1 * 0.5f + tmp02b1 * 2;
+            tmp[2][m][0] = tmp02a0 * (__fp16)0.5f + tmp02b0 * (__fp16)2;
+            tmp[2][m][1] = tmp02a1 * (__fp16)0.5f + tmp02b1 * (__fp16)2;
             tmp[3][m][0] = r5[0] + tmp13a0 * sq2_d4 + tmp13b0 * sq2_m2;
             tmp[3][m][1] = r5[1] + tmp13a1 * sq2_d4 + tmp13b1 * sq2_m2;
 
@@ -4153,8 +4153,8 @@ static inline void conv3x3s1_winograd43_transform_output_tile_fp16sa(const Mat&
             __fp16 tmp01 = bias1 + r01 + tmp02a1 + tmp02b1;
             __fp16 tmp10 = bias0 + tmp13a0 * sq2_d2 + tmp13b0 * sq2;
             __fp16 tmp11 = bias1 + tmp13a1 * sq2_d2 + tmp13b1 * sq2;
-            __fp16 tmp20 = bias0 + tmp02a0 * 0.5f + tmp02b0 * 2;
-            __fp16 tmp21 = bias1 + tmp02a1 * 0.5f + tmp02b1 * 2;
+            __fp16 tmp20 = bias0 + tmp02a0 * (__fp16)0.5f + tmp02b0 * (__fp16)2;
+            __fp16 tmp21 = bias1 + tmp02a1 * (__fp16)0.5f + tmp02b1 * (__fp16)2;
             __fp16 tmp30 = bias0 + r50 + tmp13a0 * sq2_d4 + tmp13b0 * sq2_m2;
             __fp16 tmp31 = bias1 + r51 + tmp13a1 * sq2_d4 + tmp13b1 * sq2_m2;
 
@@ -4213,7 +4213,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile_fp16sa(const Mat&
             tmp[0][m] = r0[0] + tmp02a + tmp02b;
             tmp[1][m] = tmp13a * sq2_d2 + tmp13b * sq2;
-            tmp[2][m] = tmp02a * 0.5f + tmp02b * 2;
+            tmp[2][m] = tmp02a * (__fp16)0.5f + tmp02b * (__fp16)2;
             tmp[3][m] = r5[0] + tmp13a * sq2_d4 + tmp13b * sq2_m2;
 
             r0 += max_jj * 6;
@@ -4245,7 +4245,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile_fp16sa(const Mat&
             __fp16 tmp0 = bias0 + r0 + tmp02a + tmp02b;
             __fp16 tmp1 = bias0 + tmp13a * sq2_d2 + tmp13b * sq2;
-            __fp16 tmp2 = bias0 + tmp02a * 0.5f + tmp02b * 2;
+            __fp16 tmp2 = bias0 + tmp02a * (__fp16)0.5f + tmp02b * (__fp16)2;
             __fp16 tmp3 = bias0 + r5 + tmp13a * sq2_d4 + tmp13b * sq2_m2;
 
             // if (out_elempack == 1)
@@ -4993,21 +4993,21 @@ static inline void conv3x3s1_winograd63_transform_input_tile_fp16sa(const Mat& b
                 }
             }
 
-            __fp16 tmp12a0 = r20 + r60 - r40 * 4.25f;
-            __fp16 tmp12a1 = r21 + r61 - r41 * 4.25f;
-            __fp16 tmp12b0 = r10 + r50 - r30 * 4.25f;
-            __fp16 tmp12b1 = r11 + r51 - r31 * 4.25f;
-            __fp16 tmp34a0 = r60 + r20 * 0.25f - r40 * 1.25f;
-            __fp16 tmp34a1 = r61 + r21 * 0.25f - r41 * 1.25f;
-            __fp16 tmp34b0 = r10 * 0.5f - r30 * 2.5f + r50 * 2.f;
-            __fp16 tmp34b1 = r11 * 0.5f - r31 * 2.5f + r51 * 2.f;
-            __fp16 tmp56a0 = r20 * 4.f - r40 * 5.f + r60;
-            __fp16 tmp56a1 = r21 * 4.f - r41 * 5.f + r61;
-            __fp16 tmp56b0 = r10 * 2.f - r30 * 2.5f + r50 * 0.5f;
-            __fp16 tmp56b1 = r11 * 2.f - r31 * 2.5f + r51 * 0.5f;
-
-            tmp[0][m][0] = r00 - r60 + (r40 - r20) * 5.25f;
-            tmp[0][m][1] = r01 - r61 + (r41 - r21) * 5.25f;
+            __fp16 tmp12a0 = r20 + r60 - r40 * (__fp16)4.25f;
+            __fp16 tmp12a1 = r21 + r61 - r41 * (__fp16)4.25f;
+            __fp16 tmp12b0 = r10 + r50 - r30 * (__fp16)4.25f;
+            __fp16 tmp12b1 = r11 + r51 - r31 * (__fp16)4.25f;
+            __fp16 tmp34a0 = r60 + r20 * (__fp16)0.25f - r40 * (__fp16)1.25f;
+            __fp16 tmp34a1 = r61 + r21 * (__fp16)0.25f - r41 * (__fp16)1.25f;
+            __fp16 tmp34b0 = r10 * (__fp16)0.5f - r30 * (__fp16)2.5f + r50 * (__fp16)2.f;
+            __fp16 tmp34b1 = r11 * (__fp16)0.5f - r31 * (__fp16)2.5f + r51 * (__fp16)2.f;
+            __fp16 tmp56a0 = r20 * (__fp16)4.f - r40 * (__fp16)5.f + r60;
+            __fp16 tmp56a1 = r21 * (__fp16)4.f - r41 * (__fp16)5.f + r61;
+            __fp16 tmp56b0 = r10 * (__fp16)2.f - r30 * (__fp16)2.5f + r50 * (__fp16)0.5f;
+            __fp16 tmp56b1 = r11 * (__fp16)2.f - r31 * (__fp16)2.5f + r51 * (__fp16)0.5f;
+
+            tmp[0][m][0] = r00 - r60 + (r40 - r20) * (__fp16)5.25f;
+            tmp[0][m][1] = r01 - r61 + (r41 - r21) * (__fp16)5.25f;
             tmp[1][m][0] = tmp12a0 + tmp12b0;
             tmp[1][m][1] = tmp12a1 + tmp12b1;
             tmp[2][m][0] = tmp12a0 - tmp12b0;
@@ -5020,8 +5020,8 @@ static inline void conv3x3s1_winograd63_transform_input_tile_fp16sa(const Mat& b
             tmp[5][m][1] = tmp56a1 + tmp56b1;
             tmp[6][m][0] = tmp56a0 - tmp56b0;
             tmp[6][m][1] = tmp56a1 - tmp56b1;
-            tmp[7][m][0] = r70 - r10 + (r30 - r50) * 5.25f;
-            tmp[7][m][1] = r71 - r11 + (r31 - r51) * 5.25f;
+            tmp[7][m][0] = r70 - r10 + (r30 - r50) * (__fp16)5.25f;
+            tmp[7][m][1] = r71 - r11 + (r31 - r51) * (__fp16)5.25f;
 
             r0 += w;
         }
@@ -5054,21 +5054,21 @@ static inline void conv3x3s1_winograd63_transform_input_tile_fp16sa(const Mat& b
             __fp16 r70 = tmp[m][7][0];
             __fp16 r71 = tmp[m][7][1];
 
-            __fp16 tmp12a0 = r20 + r60 - r40 * 4.25f;
-            __fp16 tmp12a1 = r21 + r61 - r41 * 4.25f;
-            __fp16 tmp12b0 = r10 + r50 - r30 * 4.25f;
-            __fp16 tmp12b1 = r11 + r51 - r31 * 4.25f;
-            __fp16 tmp34a0 = r60 + r20 * 0.25f - r40 * 1.25f;
-            __fp16 tmp34a1 = r61 + r21 * 0.25f - r41 * 1.25f;
-            __fp16 tmp34b0 = r10 * 0.5f - r30 * 2.5f + r50 * 2.f;
-            __fp16 tmp34b1 = r11 * 0.5f - r31 * 2.5f + r51 * 2.f;
-            __fp16 tmp56a0 = r20 * 4.f - r40 * 5.f + r60;
-            __fp16 tmp56a1 = r21 * 4.f - r41 * 5.f + r61;
-            __fp16 tmp56b0 = r10 * 2.f - r30 * 2.5f + r50 * 0.5f;
-            __fp16 tmp56b1 = r11 * 2.f - r31 * 2.5f + r51 * 0.5f;
-
-            p0[0] = r00 - r60 + (r40 - r20) * 5.25f;
-            p0[1] = r01 - r61 + (r41 - r21) * 5.25f;
+            __fp16 tmp12a0 = r20 + r60 - r40 * (__fp16)4.25f;
+            __fp16 tmp12a1 = r21 + r61 - r41 * (__fp16)4.25f;
+            __fp16 tmp12b0 = r10 + r50 - r30 * (__fp16)4.25f;
+            __fp16 tmp12b1 = r11 + r51 - r31 * (__fp16)4.25f;
+            __fp16 tmp34a0 = r60 + r20 * (__fp16)0.25f - r40 * (__fp16)1.25f;
+            __fp16 tmp34a1 = r61 + r21 * (__fp16)0.25f - r41 * (__fp16)1.25f;
+            __fp16 tmp34b0 = r10 * (__fp16)0.5f - r30 * (__fp16)2.5f + r50 * (__fp16)2.f;
+            __fp16 tmp34b1 = r11 * (__fp16)0.5f - r31 * (__fp16)2.5f + r51 * (__fp16)2.f;
+            __fp16 tmp56a0 = r20 * (__fp16)4.f - r40 * (__fp16)5.f + r60;
+            __fp16 tmp56a1 = r21 * (__fp16)4.f - r41 * (__fp16)5.f + r61;
+            __fp16 tmp56b0 = r10 * (__fp16)2.f - r30 * (__fp16)2.5f + r50 * (__fp16)0.5f;
+            __fp16 tmp56b1 = r11 * (__fp16)2.f - r31 * (__fp16)2.5f + r51 * (__fp16)0.5f;
+
+            p0[0] = r00 - r60 + (r40 - r20) * (__fp16)5.25f;
+            p0[1] = r01 - r61 + (r41 - r21) * (__fp16)5.25f;
             p1[0] = tmp12a0 + tmp12b0;
             p1[1] = tmp12a1 + tmp12b1;
             p2[0] = tmp12a0 - tmp12b0;
@@ -5081,8 +5081,8 @@ static inline void conv3x3s1_winograd63_transform_input_tile_fp16sa(const Mat& b
             p5[1] = tmp56a1 + tmp56b1;
             p6[0] = tmp56a0 - tmp56b0;
             p6[1] = tmp56a1 - tmp56b1;
-            p7[0] = r70 - r10 + (r30 - r50) * 5.25f;
-            p7[1] = r71 - r11 + (r31 - r51) * 5.25f;
+            p7[0] = r70 - r10 + (r30 - r50) * (__fp16)5.25f;
+            p7[1] = r71 - r11 + (r31 - r51) * (__fp16)5.25f;
 
             p0 += max_jj * 8 * 2;
             p1 += max_jj * 8 * 2;
@@ -5134,21 +5134,21 @@ static inline void conv3x3s1_winograd63_transform_input_tile_fp16sa(const Mat& b
                 }
             }
 
-            __fp16 tmp12a = r2 + r6 - r4 * 4.25f;
-            __fp16 tmp12b = r1 + r5 - r3 * 4.25f;
-            __fp16 tmp34a = r6 + r2 * 0.25f - r4 * 1.25f;
-            __fp16 tmp34b = r1 * 0.5f - r3 * 2.5f + r5 * 2.f;
-            __fp16 tmp56a = r2 * 4.f - r4 * 5.f + r6;
-            __fp16 tmp56b = r1 * 2.f - r3 * 2.5f + r5 * 0.5f;
+            __fp16 tmp12a = r2 + r6 - r4 * (__fp16)4.25f;
+            __fp16 tmp12b = r1 + r5 - r3 * (__fp16)4.25f;
+            __fp16 tmp34a = r6 + r2 * (__fp16)0.25f - r4 * (__fp16)1.25f;
+            __fp16 tmp34b = r1 * (__fp16)0.5f - r3 * (__fp16)2.5f + r5 * (__fp16)2.f;
+            __fp16 tmp56a = r2 * (__fp16)4.f - r4 * (__fp16)5.f + r6;
+            __fp16 tmp56b = r1 * (__fp16)2.f - r3 * (__fp16)2.5f + r5 * (__fp16)0.5f;
 
-            tmp[0][m] = r0 - r6 + (r4 - r2) * 5.25f;
+            tmp[0][m] = r0 - r6 + (r4 - r2) * (__fp16)5.25f;
             tmp[1][m] = tmp12a + tmp12b;
             tmp[2][m] = tmp12a - tmp12b;
             tmp[3][m] = tmp34a + tmp34b;
             tmp[4][m] = tmp34a - tmp34b;
             tmp[5][m] = tmp56a + tmp56b;
             tmp[6][m] = tmp56a - tmp56b;
-            tmp[7][m] = r7 - r1 + (r3 - r5) * 5.25f;
+            tmp[7][m] = r7 - r1 + (r3 - r5) * (__fp16)5.25f;
 
             r0123 += w;
         }
@@ -5173,21 +5173,21 @@ static inline void conv3x3s1_winograd63_transform_input_tile_fp16sa(const Mat& b
             __fp16 r6 = tmp[m][6];
             __fp16 r7 = tmp[m][7];
 
-            __fp16 tmp12a = r2 + r6 - r4 * 4.25f;
-            __fp16 tmp12b = r1 + r5 - r3 * 4.25f;
-            __fp16 tmp34a = r6 + r2 * 0.25f - r4 * 1.25f;
-            __fp16 tmp34b = r1 * 0.5f - r3 * 2.5f + r5 * 2.f;
-            __fp16 tmp56a = r2 * 4.f - r4 * 5.f + r6;
-            __fp16 tmp56b = r1 * 2.f - r3 * 2.5f + r5 * 0.5f;
+            __fp16 tmp12a = r2 + r6 - r4 * (__fp16)4.25f;
+            __fp16 tmp12b = r1 + r5 - r3 * (__fp16)4.25f;
+            __fp16 tmp34a = r6 + r2 * (__fp16)0.25f - r4 * (__fp16)1.25f;
+            __fp16 tmp34b = r1 * (__fp16)0.5f - r3 * (__fp16)2.5f + r5 * (__fp16)2.f;
+            __fp16 tmp56a = r2 * (__fp16)4.f - r4 * (__fp16)5.f + r6;
+            __fp16 tmp56b = r1 * (__fp16)2.f - r3 * (__fp16)2.5f + r5 * (__fp16)0.5f;
 
-            p0[0] = r0 - r6 + (r4 - r2) * 5.25f;
+            p0[0] = r0 - r6 + (r4 - r2) * (__fp16)5.25f;
             p1[0] = tmp12a + tmp12b;
             p2[0] = tmp12a - tmp12b;
             p3[0] = tmp34a + tmp34b;
             p4[0] = tmp34a - tmp34b;
             p5[0] = tmp56a + tmp56b;
             p6[0] = tmp56a - tmp56b;
-            p7[0] = r7 - r1 + (r3 - r5) * 5.25f;
+            p7[0] = r7 - r1 + (r3 - r5) * (__fp16)5.25f;
 
             p0 += max_jj * 8;
             p1 += max_jj * 8;
@@ -5670,18 +5670,18 @@ static inline void conv3x3s1_winograd63_transform_output_tile_fp16sa(const Mat&
             __fp16 tmp135c0 = r5[0] - r6[0];
             __fp16 tmp135c1 = r5[1] - r6[1];
 
-            tmp[0][m][0] = r0[0] + tmp024a0 + tmp024b0 + tmp024c0 * 32;
-            tmp[0][m][1] = r0[1] + tmp024a1 + tmp024b1 + tmp024c1 * 32;
-            tmp[1][m][0] = tmp135a0 + tmp135b0 + tmp135b0 + tmp135c0 * 16;
-            tmp[1][m][1] = tmp135a1 + tmp135b1 + tmp135b1 + tmp135c1 * 16;
-            tmp[2][m][0] = tmp024a0 + tmp024b0 * 4 + tmp024c0 * 8;
-            tmp[2][m][1] = tmp024a1 + tmp024b1 * 4 + tmp024c1 * 8;
-            tmp[3][m][0] = tmp135a0 + tmp135b0 * 8 + tmp135c0 * 4;
-            tmp[3][m][1] = tmp135a1 + tmp135b1 * 8 + tmp135c1 * 4;
-            tmp[4][m][0] = tmp024a0 + tmp024b0 * 16 + tmp024c0 + tmp024c0;
-            tmp[4][m][1] = tmp024a1 + tmp024b1 * 16 + tmp024c1 + tmp024c1;
-            tmp[5][m][0] = r7[0] + tmp135a0 + tmp135b0 * 32 + tmp135c0;
-            tmp[5][m][1] = r7[1] + tmp135a1 + tmp135b1 * 32 + tmp135c1;
+            tmp[0][m][0] = r0[0] + tmp024a0 + tmp024b0 + tmp024c0 * (__fp16)32;
+            tmp[0][m][1] = r0[1] + tmp024a1 + tmp024b1 + tmp024c1 * (__fp16)32;
+            tmp[1][m][0] = tmp135a0 + tmp135b0 + tmp135b0 + tmp135c0 * (__fp16)16;
+            tmp[1][m][1] = tmp135a1 + tmp135b1 + tmp135b1 + tmp135c1 * (__fp16)16;
+            tmp[2][m][0] = tmp024a0 + tmp024b0 * (__fp16)4 + tmp024c0 * (__fp16)8;
+            tmp[2][m][1] = tmp024a1 + tmp024b1 * (__fp16)4 + tmp024c1 * (__fp16)8;
+            tmp[3][m][0] = tmp135a0 + tmp135b0 * (__fp16)8 + tmp135c0 * (__fp16)4;
+            tmp[3][m][1] = tmp135a1 + tmp135b1 * (__fp16)8 + tmp135c1 * (__fp16)4;
+            tmp[4][m][0] = tmp024a0 + tmp024b0 * (__fp16)16 + tmp024c0 + tmp024c0;
+            tmp[4][m][1] = tmp024a1 + tmp024b1 * (__fp16)16 + tmp024c1 + tmp024c1;
+            tmp[5][m][0] = r7[0] + tmp135a0 + tmp135b0 * (__fp16)32 + tmp135c0;
+            tmp[5][m][1] = r7[1] + tmp135a1 + tmp135b1 * (__fp16)32 + tmp135c1;
 
             r0 += max_jj * 8 * 2;
             r1 += max_jj * 8 * 2;
@@ -5730,18 +5730,18 @@ static inline void conv3x3s1_winograd63_transform_output_tile_fp16sa(const Mat&
             __fp16 tmp135c0 = r50 - r60;
             __fp16 tmp135c1 = r51 - r61;
 
-            __fp16 tmp00 = bias0 + r00 + tmp024a0 + tmp024b0 + tmp024c0 * 32;
-            __fp16 tmp01 = bias1 + r01 + tmp024a1 + tmp024b1 + tmp024c1 * 32;
-            __fp16 tmp10 = bias0 + tmp135a0 + tmp135b0 + tmp135b0 + tmp135c0 * 16;
-            __fp16 tmp11 = bias1 + tmp135a1 + tmp135b1 + tmp135b1 + tmp135c1 * 16;
-            __fp16 tmp20 = bias0 + tmp024a0 + tmp024b0 * 4 + tmp024c0 * 8;
-            __fp16 tmp21 = bias1 + tmp024a1 + tmp024b1 * 4 + tmp024c1 * 8;
-            __fp16 tmp30 = bias0 + tmp135a0 + tmp135b0 * 8 + tmp135c0 * 4;
-            __fp16 tmp31 = bias1 + tmp135a1 + tmp135b1 * 8 + tmp135c1 * 4;
-            __fp16 tmp40 = bias0 + tmp024a0 + tmp024b0 * 16 + tmp024c0 + tmp024c0;
-            __fp16 tmp41 = bias1 + tmp024a1 + tmp024b1 * 16 + tmp024c1 + tmp024c1;
-            __fp16 tmp50 = bias0 + r70 + tmp135a0 + tmp135b0 * 32 + tmp135c0;
-            __fp16 tmp51 = bias1 + r71 + tmp135a1 + tmp135b1 * 32 + tmp135c1;
+            __fp16 tmp00 = bias0 + r00 + tmp024a0 + tmp024b0 + tmp024c0 * (__fp16)32;
+            __fp16 tmp01 = bias1 + r01 + tmp024a1 + tmp024b1 + tmp024c1 * (__fp16)32;
+            __fp16 tmp10 = bias0 + tmp135a0 + tmp135b0 + tmp135b0 + tmp135c0 * (__fp16)16;
+            __fp16 tmp11 = bias1 + tmp135a1 + tmp135b1 + tmp135b1 + tmp135c1 * (__fp16)16;
+            __fp16 tmp20 = bias0 + tmp024a0 + tmp024b0 * (__fp16)4 + tmp024c0 * (__fp16)8;
+            __fp16 tmp21 = bias1 + tmp024a1 + tmp024b1 * (__fp16)4 + tmp024c1 * (__fp16)8;
+            __fp16 tmp30 = bias0 + tmp135a0 + tmp135b0 * (__fp16)8 + tmp135c0 * (__fp16)4;
+            __fp16 tmp31 = bias1 + tmp135a1 + tmp135b1 * (__fp16)8 + tmp135c1 * (__fp16)4;
+            __fp16 tmp40 = bias0 + tmp024a0 + tmp024b0 * (__fp16)16 + tmp024c0 + tmp024c0;
+            __fp16 tmp41 = bias1 + tmp024a1 + tmp024b1 * (__fp16)16 + tmp024c1 + tmp024c1;
+            __fp16 tmp50 = bias0 + r70 + tmp135a0 + tmp135b0 * (__fp16)32 + tmp135c0;
+            __fp16 tmp51 = bias1 + r71 + tmp135a1 + tmp135b1 * (__fp16)32 + tmp135c1;
 
             // if (out_elempack == 1)
             {
@@ -5810,12 +5810,12 @@ static inline void conv3x3s1_winograd63_transform_output_tile_fp16sa(const Mat&
             __fp16 tmp024c = r5[0] + r6[0];
             __fp16 tmp135c = r5[0] - r6[0];
 
-            tmp[0][m] = r0[0] + tmp024a + tmp024b + tmp024c * 32;
-            tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * 16;
-            tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 8;
-            tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4;
-            tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c + tmp024c;
-            tmp[5][m] = r7[0] + tmp135a + tmp135b * 32 + tmp135c;
+            tmp[0][m] = r0[0] + tmp024a + tmp024b + tmp024c * (__fp16)32;
+            tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * (__fp16)16;
+            tmp[2][m] = tmp024a + tmp024b * (__fp16)4 + tmp024c * (__fp16)8;
+            tmp[3][m] = tmp135a + tmp135b * (__fp16)8 + tmp135c * (__fp16)4;
+            tmp[4][m] = tmp024a + tmp024b * (__fp16)16 + tmp024c + tmp024c;
+            tmp[5][m] = r7[0] + tmp135a + tmp135b * (__fp16)32 + tmp135c;
 
             r0 += max_jj * 8;
             r1 += max_jj * 8;
@@ -5850,12 +5850,12 @@ static inline void conv3x3s1_winograd63_transform_output_tile_fp16sa(const Mat&
             __fp16 tmp024c = r5 + r6;
             __fp16 tmp135c = r5 - r6;
 
-            __fp16 tmp0 = bias0 + r0 + tmp024a + tmp024b + tmp024c * 32;
-            __fp16 tmp1 = bias0 + tmp135a + tmp135b + tmp135b + tmp135c * 16;
-            __fp16 tmp2 = bias0 + tmp024a + tmp024b * 4 + tmp024c * 8;
-            __fp16 tmp3 = bias0 + tmp135a + tmp135b * 8 + tmp135c * 4;
-            __fp16 tmp4 = bias0 + tmp024a + tmp024b * 16 + tmp024c + tmp024c;
-            __fp16 tmp5 = bias0 + r7 + tmp135a + tmp135b * 32 + tmp135c;
+            __fp16 tmp0 = bias0 + r0 + tmp024a + tmp024b + tmp024c * (__fp16)32;
+            __fp16 tmp1 = bias0 + tmp135a + tmp135b + tmp135b + tmp135c * (__fp16)16;
+            __fp16 tmp2 = bias0 + tmp024a + tmp024b * (__fp16)4 + tmp024c * (__fp16)8;
+            __fp16 tmp3 = bias0 + tmp135a + tmp135b * (__fp16)8 + tmp135c * (__fp16)4;
+            __fp16 tmp4 = bias0 + tmp024a + tmp024b * (__fp16)16 + tmp024c + tmp024c;
+            __fp16 tmp5 = bias0 + r7 + tmp135a + tmp135b * (__fp16)32 + tmp135c;
 
             // if (out_elempack == 1)
             {
diff --git a/src/layer/arm/convolution_arm_asimdhp.cpp b/src/layer/arm/convolution_arm_asimdhp.cpp
index 4d229dc4db7..6480aa2e78a 100644
--- a/src/layer/arm/convolution_arm_asimdhp.cpp
+++ b/src/layer/arm/convolution_arm_asimdhp.cpp
@@ -34,12 +34,14 @@ namespace ncnn {
 #include "convolution_im2col_gemm_bf16s_fp16s.h"
 #include "convolution_im2col_gemm_fp16s.h"
 
+#if NCNN_GNU_INLINE_ASM
 #include "convolution_3x3_pack4_fp16s.h"
 #include "convolution_3x3_pack1to8_fp16s.h"
 #include "convolution_3x3_pack1to4_fp16s.h"
 #include "convolution_3x3_pack8_fp16s.h"
 #include "convolution_5x5_pack8_fp16s.h"
 #include "convolution_7x7_pack1to8_fp16s.h"
+#endif // NCNN_GNU_INLINE_ASM
 #endif
 #endif // __ARM_NEON
 
@@ -122,6 +124,7 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
     int l2_cache_size_fp16 = get_cpu_level2_cache_size() / sizeof(unsigned short);
     bool prefer_sgemm = num_input * num_output * kernel_w * kernel_h * dilation_w * dilation_h * stride_w * stride_h * 2 > l2_cache_size_fp16 || (num_input > 16 || num_output > 16);
 
+#if NCNN_GNU_INLINE_ASM
     if (elempack == 8 && out_elempack == 8)
     {
         if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
@@ -181,6 +184,7 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
             prefer_sgemm = false;
         }
     }
+#endif // NCNN_GNU_INLINE_ASM
 
     if (opt.use_fp16_arithmetic && ((opt.use_sgemm_convolution && prefer_sgemm) || (kernel_w == 1 && kernel_h == 1)))
     {
@@ -196,6 +200,7 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
         return 0;
     }
 
+#if NCNN_GNU_INLINE_ASM
     if ((elempack == 8 && out_elempack == 8 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
         || (elempack == 8 && out_elempack == 8 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
         || (elempack == 8 && out_elempack == 8 && kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
@@ -210,6 +215,7 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
         convolution_transform_kernel_packed_fp16s_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
     }
     else
+#endif // NCNN_GNU_INLINE_ASM
     {
         convolution_transform_kernel_packed_fp16s(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
     }
@@ -399,6 +405,7 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
     int l2_cache_size_fp16 = get_cpu_level2_cache_size() / sizeof(unsigned short);
     bool prefer_sgemm = num_input * num_output * kernel_w * kernel_h * dilation_w * dilation_h * stride_w * stride_h * 2 > l2_cache_size_fp16 || (num_input > 16 || num_output > 16);
 
+#if NCNN_GNU_INLINE_ASM
     if (elempack == 8 && out_elempack == 8)
     {
         if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
@@ -458,6 +465,7 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
             prefer_sgemm = false;
         }
     }
+#endif // NCNN_GNU_INLINE_ASM
 
     if ((opt.use_sgemm_convolution && prefer_sgemm) || (kernel_w == 1 && kernel_h == 1))
     {
@@ -478,6 +486,7 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
         return 0;
     }
 
+#if NCNN_GNU_INLINE_ASM
     if (elempack == 8 && out_elempack == 8)
     {
         if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
@@ -634,6 +643,11 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
             convolution_packed_fp16sa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
         }
     }
+#else  // NCNN_GNU_INLINE_ASM
    {
+        convolution_packed_fp16sa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
+    }
+#endif // NCNN_GNU_INLINE_ASM
 
     return 0;
 }
diff --git a/src/layer/arm/convolution_packed_fp16s.h b/src/layer/arm/convolution_packed_fp16s.h
index dddb1ec544d..d565fb23fb9 100644
--- a/src/layer/arm/convolution_packed_fp16s.h
+++ b/src/layer/arm/convolution_packed_fp16s.h
@@ -1428,7 +1428,7 @@ static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
             _sum2 = vaddq_f16(_sum2, _sum3);
             _sum0 = vaddq_f16(_sum0, _sum2);
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
 
             if (out_elempack == 8)
             {
@@ -1621,7 +1621,7 @@ static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
             _sum2 = vadd_f16(_sum2, _sum3);
             _sum0 = vadd_f16(_sum0, _sum2);
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
 
             if (out_elempack == 4)
             {
@@ -1787,8 +1787,8 @@ static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                 sum0 += vget_lane_f16(_ss, 0);
                 sum1 += vget_lane_f16(_ss, 1);
 
-            sum0 = activation_ss(sum0, activation_type, activation_params);
-            sum1 = activation_ss(sum1, activation_type, activation_params);
+            sum0 = activation_ss_f16(sum0, activation_type, activation_params);
+            sum1 = activation_ss_f16(sum1, activation_type, activation_params);
 
             outptr0[0] = sum0;
             outptr1[0] = sum1;
@@ -1923,7 +1923,7 @@ static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                 _ss = vpadd_f16(_ss, _ss);
                 sum += vget_lane_f16(_ss, 0);
 
-            sum = activation_ss(sum, activation_type, activation_params);
+            sum = activation_ss_f16(sum, activation_type, activation_params);
 
             outptr[0] = sum;
             outptr += 1;
diff --git a/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp b/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp
index 94c5cb0176e..f7d2cfee84c 100644
--- a/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp
@@ -431,7 +431,7 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
                         _sum = vfmaq_f16(_sum, _val, _w);
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1q_f16(outptr + j * 8, _sum);
                 }
@@ -493,7 +493,7 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
                         _sum = vfma_f16(_sum, _val, _w);
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1_f16(outptr + j * 4, _sum);
                 }
@@ -574,7 +574,7 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
                         sum += val * w;
                     }
 
-                    sum = activation_ss(sum, activation_type, activation_params);
+                    sum = activation_ss_f16(sum, activation_type, activation_params);
 
                     outptr[j] = (__fp16)sum;
                 }
diff --git a/src/layer/arm/deconvolution_arm_asimdhp.cpp b/src/layer/arm/deconvolution_arm_asimdhp.cpp
index c786a541f5b..c98ba40309b 100644
--- a/src/layer/arm/deconvolution_arm_asimdhp.cpp
+++ b/src/layer/arm/deconvolution_arm_asimdhp.cpp
@@ -767,7 +767,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                         kptr += maxk * 64;
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1q_f16(outptr + j * 8, _sum);
                 }
@@ -838,7 +838,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                         kptr += maxk * 8;
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1q_f16(outptr + j * 8, _sum);
                 }
@@ -915,7 +915,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                        kptr += maxk * 32;
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1q_f16(outptr + j * 8, _sum);
                 }
@@ -989,7 +989,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                         kptr += maxk * 8;
                     }
 
-                    sum = activation_ss(sum, activation_type, activation_params);
+                    sum = activation_ss_f16(sum, activation_type, activation_params);
 
                     outptr[j] = (__fp16)sum;
                 }
@@ -1074,7 +1074,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                         kptr += maxk * 32;
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1_f16(outptr + j * 4, _sum);
                 }
@@ -1151,7 +1151,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                         kptr += maxk * 16;
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1_f16(outptr + j * 4, _sum);
                 }
@@ -1222,7 +1222,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                         kptr += maxk * 4;
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1_f16(outptr + j * 4, _sum);
                 }
@@ -1295,7 +1295,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                         kptr += maxk * 4;
                     }
 
-                    sum = activation_ss(sum, activation_type, activation_params);
+                    sum = activation_ss_f16(sum, activation_type, activation_params);
 
                     outptr[j] = (__fp16)sum;
                 }
@@ -1377,7 +1377,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                         kptr += maxk;
                     }
 
-                    sum = activation_ss(sum, activation_type, activation_params);
+                    sum = activation_ss_f16(sum, activation_type, activation_params);
 
                     outptr[j] = (__fp16)sum;
                 }
diff --git a/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp b/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp
index 317a7106ece..09e0fca4356 100644
--- a/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp
+++ b/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp
@@ -464,7 +464,7 @@ int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_
                         }
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1q_f16(outptr + j * 8, _sum);
                 }
@@ -528,7 +528,7 @@ int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_
                         }
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1_f16(outptr + j * 4, _sum);
                 }
@@ -592,7 +592,7 @@ int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_
                         }
                     }
 
-                    sum = activation_ss(sum, activation_type, activation_params);
+                    sum = activation_ss_f16(sum, activation_type, activation_params);
 
                     outptr[j] = (__fp16)sum;
                 }
diff --git a/src/layer/arm/dequantize_arm_asimdhp.cpp b/src/layer/arm/dequantize_arm_asimdhp.cpp
index 62977abff9a..3e03c8638dd 100644
--- a/src/layer/arm/dequantize_arm_asimdhp.cpp
+++ b/src/layer/arm/dequantize_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {
diff --git a/src/layer/arm/eltwise_arm_asimdhp.cpp b/src/layer/arm/eltwise_arm_asimdhp.cpp
index 4335db6c34b..0d9cb62ac07 100644
--- a/src/layer/arm/eltwise_arm_asimdhp.cpp
+++ b/src/layer/arm/eltwise_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {
diff --git a/src/layer/arm/gelu_arm_asimdhp.cpp b/src/layer/arm/gelu_arm_asimdhp.cpp
index ea8b159cfa8..0b4b59ab5f9 100644
--- a/src/layer/arm/gelu_arm_asimdhp.cpp
+++ b/src/layer/arm/gelu_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #include "neon_mathfun.h"
 #if NCNN_ARM82
 #include "neon_mathfun_fp16s.h"
 #endif
@@ -92,7 +93,7 @@ int GELU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co
             _blob = vmulq_f16(_pLoad, _blob);
             _blob = vmulq_f16(vdupq_n_f16(0.044715f * 0.79788452f), _blob);
             _blob = vfmaq_f16(_blob, vdupq_n_f16(0.79788452f), _pLoad);
-            _blob = tanh_ps(_blob);
+            _blob = tanh_ps_f16(_blob);
             _blob = vaddq_f16(vdupq_n_f16(1.f), _blob);
             _blob = vmulq_f16(vdupq_n_f16(0.5f), vmulq_f16(_blob, _pLoad));
             vst1q_f16(ptr, _blob);
@@ -101,7 +102,7 @@ int GELU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co
 
         for (; i < size; i++)
         {
-            *ptr = (__fp16)0.5f * *ptr * ((__fp16)1.0f + tanhf((__fp16)0.79788452f * (*ptr + (__fp16)0.044715f * *ptr * *ptr * *ptr)));
+            *ptr = (__fp16)0.5f * *ptr * (__fp16)(1.0f + tanhf((__fp16)0.79788452f * (*ptr + (__fp16)0.044715f * *ptr * *ptr * *ptr)));
 
             ptr++;
         }
diff --git a/src/layer/arm/gemm_arm_asimdhp.cpp b/src/layer/arm/gemm_arm_asimdhp.cpp
index 155658ab077..ff840df3b50 100644
--- a/src/layer/arm/gemm_arm_asimdhp.cpp
+++ b/src/layer/arm/gemm_arm_asimdhp.cpp
@@ -2922,7 +2922,7 @@ int Gemm_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<
diff --git a/src/layer/arm/hardsigmoid_arm_asimdhp.cpp b/src/layer/arm/hardsigmoid_arm_asimdhp.cpp
--- a/src/layer/arm/hardsigmoid_arm_asimdhp.cpp
+++ b/src/layer/arm/hardsigmoid_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {
diff --git a/src/layer/arm/hardswish_arm_asimdhp.cpp b/src/layer/arm/hardswish_arm_asimdhp.cpp
index 18681676a9f..5ab492ef0d8 100644
--- a/src/layer/arm/hardswish_arm_asimdhp.cpp
+++ b/src/layer/arm/hardswish_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {
diff --git a/src/layer/arm/innerproduct_arm_asimdhp.cpp b/src/layer/arm/innerproduct_arm_asimdhp.cpp
index b859c307c6e..de475d5ca59 100644
--- a/src/layer/arm/innerproduct_arm_asimdhp.cpp
+++ b/src/layer/arm/innerproduct_arm_asimdhp.cpp
@@ -120,14 +120,14 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 8;
             }
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
-            _sum1 = activation_ps(_sum1, activation_type, activation_params);
-            _sum2 = activation_ps(_sum2, activation_type, activation_params);
-            _sum3 = activation_ps(_sum3, activation_type, activation_params);
-            _sum4 = activation_ps(_sum4, activation_type, activation_params);
-            _sum5 = activation_ps(_sum5, activation_type, activation_params);
-            _sum6 = activation_ps(_sum6, activation_type, activation_params);
-            _sum7 = activation_ps(_sum7, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
+            _sum1 = activation_ps_f16(_sum1, activation_type, activation_params);
+            _sum2 = activation_ps_f16(_sum2, activation_type, activation_params);
+            _sum3 = activation_ps_f16(_sum3, activation_type, activation_params);
+            _sum4 = activation_ps_f16(_sum4, activation_type, activation_params);
+            _sum5 = activation_ps_f16(_sum5, activation_type, activation_params);
+            _sum6 = activation_ps_f16(_sum6, activation_type, activation_params);
+            _sum7 = activation_ps_f16(_sum7, activation_type, activation_params);
 
             vst1q_f16(outptr, _sum0);
             vst1q_f16(outptr + 8, _sum1);
@@ -167,7 +167,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 8;
             }
 
-            _sum = activation_ps(_sum, activation_type, activation_params);
+            _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
             vst1q_f16(outptr, _sum);
             outptr += 8;
@@ -221,14 +221,14 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 8;
             }
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
-            _sum1 = activation_ps(_sum1, activation_type, activation_params);
-            _sum2 = activation_ps(_sum2, activation_type, activation_params);
-            _sum3 = activation_ps(_sum3, activation_type, activation_params);
-            _sum4 = activation_ps(_sum4, activation_type, activation_params);
-            _sum5 = activation_ps(_sum5, activation_type, activation_params);
-            _sum6 = activation_ps(_sum6, activation_type, activation_params);
-            _sum7 = activation_ps(_sum7, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
+            _sum1 = activation_ps_f16(_sum1, activation_type, activation_params);
+            _sum2 = activation_ps_f16(_sum2, activation_type, activation_params);
+            _sum3 = activation_ps_f16(_sum3, activation_type, activation_params);
+            _sum4 = activation_ps_f16(_sum4, activation_type, activation_params);
+            _sum5 = activation_ps_f16(_sum5, activation_type, activation_params);
+            _sum6 = activation_ps_f16(_sum6, activation_type, activation_params);
+            _sum7 = activation_ps_f16(_sum7, activation_type, activation_params);
 
             vst1_f16(outptr, _sum0);
             vst1_f16(outptr + 4, _sum1);
@@ -268,7 +268,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 1;
             }
 
-            _sum = activation_ps(_sum, activation_type, activation_params);
+            _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
             vst1q_f16(outptr, _sum);
             outptr += 8;
@@ -310,10 +310,10 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 4;
             }
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
-            _sum1 = activation_ps(_sum1, activation_type, activation_params);
-            _sum2 = activation_ps(_sum2, activation_type, activation_params);
-            _sum3 = activation_ps(_sum3, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
+            _sum1 = activation_ps_f16(_sum1, activation_type, activation_params);
+            _sum2 = activation_ps_f16(_sum2, activation_type, activation_params);
+            _sum3 = activation_ps_f16(_sum3, activation_type, activation_params);
 
             vst1q_f16(outptr, _sum0);
             vst1q_f16(outptr + 8, _sum1);
@@ -358,10 +358,10 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 4;
             }
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
-            _sum1 = activation_ps(_sum1, activation_type, activation_params);
-            _sum2 = activation_ps(_sum2, activation_type, activation_params);
-            _sum3 = activation_ps(_sum3, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
+            _sum1 = activation_ps_f16(_sum1, activation_type, activation_params);
+            _sum2 = activation_ps_f16(_sum2, activation_type, activation_params);
+            _sum3 = activation_ps_f16(_sum3, activation_type, activation_params);
 
             vst1_f16(outptr, _sum0);
             vst1_f16(outptr + 4, _sum1);
@@ -397,7 +397,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 4;
             }
 
-            _sum = activation_ps(_sum, activation_type, activation_params);
+            _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
             vst1_f16(outptr, _sum);
             outptr += 4;
@@ -430,7 +430,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 1;
             }
 
-            _sum = activation_ps(_sum, activation_type, activation_params);
+            _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
             vst1_f16(outptr, _sum);
             outptr += 4;
@@ -461,7 +461,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 1;
             }
 
-            sum = activation_ss(sum, activation_type, activation_params);
+            sum = activation_ss_f16(sum, activation_type, activation_params);
 
             outptr[0] = (__fp16)sum;
             outptr += 1;
@@ -614,7 +614,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
             _sum4 = vaddq_f16(_sum4, _sum6);
             _sum0 = vaddq_f16(_sum0, _sum4);
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
 
             __fp16* outptr = (__fp16*)top_blob;
             vst1q_f16(outptr + p * 8, _sum0);
@@ -739,7 +739,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
             _sum4 = vadd_f16(_sum4, _sum6);
             _sum0 = vadd_f16(_sum0, _sum4);
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
 
             __fp16* outptr = (__fp16*)top_blob;
             vst1_f16(outptr + p * 4, _sum0);
@@ -787,7 +787,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
             float16x4_t _s4 = vadd_f16(vget_low_f16(_sum), vget_high_f16(_sum));
             sum += vaddvq_f32(vcvt_f32_f16(_s4)); // dot
 
-            sum = activation_ss(sum, activation_type, activation_params);
+            sum = activation_ss_f16(sum, activation_type, activation_params);
 
             __fp16* outptr = (__fp16*)top_blob;
             outptr[p] = (__fp16)sum;
diff --git a/src/layer/arm/instancenorm_arm_asimdhp.cpp b/src/layer/arm/instancenorm_arm_asimdhp.cpp
index 4ea0b6d55b7..67e408122ff 100644
--- a/src/layer/arm/instancenorm_arm_asimdhp.cpp
+++ b/src/layer/arm/instancenorm_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {
@@ -231,8 +232,9 @@ int InstanceNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option&
 #endif // __ARM_NEON
         for (; i < size; i++)
         {
-            float tmp = *ptr++ - mean;
+            float tmp = (float)*ptr - mean;
             sqsum += tmp * tmp;
+            ptr++;
         }
         float var = sqsum / size;
         // the var maybe minus due to accuracy
@@ -245,13 +247,15 @@ int InstanceNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option&
             float gamma = gamma_data[q];
             float beta = beta_data[q];
 
-            a = (__fp16)(gamma / (sqrtf(var + eps)));
-            b = (__fp16)(-mean * a + beta);
+            float a_fp32 = gamma / (sqrtf(var + eps));
+            a = (__fp16)(a_fp32);
+            b = (__fp16)(-mean * a_fp32 + beta);
         }
         else
         {
-            a = (__fp16)(1.f / (sqrtf(var + eps)));
-            b = (__fp16)(-mean * a);
+            float a_fp32 = 1.f / (sqrtf(var + eps));
+            a = (__fp16)(a_fp32);
+            b = (__fp16)(-mean * a_fp32);
         }
 
         i = 0;
diff --git a/src/layer/arm/interp_arm_asimdhp.cpp b/src/layer/arm/interp_arm_asimdhp.cpp
index 286c74fe40c..b7d78d7bb91 100644
--- a/src/layer/arm/interp_arm_asimdhp.cpp
+++ b/src/layer/arm/interp_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {
diff --git a/src/layer/arm/interp_bicubic_fp16s.h b/src/layer/arm/interp_bicubic_fp16s.h
index be3e8159d87..18b2e56799e 100644
--- a/src/layer/arm/interp_bicubic_fp16s.h
+++ b/src/layer/arm/interp_bicubic_fp16s.h
@@ -24,7 +24,7 @@ static inline void interpolate_cubic_fp16sa(float fx, __fp16* coeffs)
     coeffs[0] = (__fp16)(A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A);
     coeffs[1] = (__fp16)((A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1);
     coeffs[2] = (__fp16)((A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1);
-    coeffs[3] = (__fp16)(1.f - coeffs[0] - coeffs[1] - coeffs[2]);
+    coeffs[3] = (__fp16)((__fp16)1.f - coeffs[0] - coeffs[1] - coeffs[2]);
 }
 
 static void cubic_coeffs_fp16sa(int w, int outw, int* xofs, __fp16* alpha, int align_corner)
@@ -51,7 +51,7 @@ static void cubic_coeffs_fp16sa(int w, int outw, int* xofs, __fp16* alpha, int a
             if (sx <= -1)
             {
                 sx = 1;
-                alpha[dx * 4 + 0] = (__fp16)(1.f - alpha[dx * 4 + 3]);
+                alpha[dx * 4 + 0] = (__fp16)((__fp16)1.f - alpha[dx * 4 + 3]);
                 alpha[dx * 4 + 1] = (__fp16)alpha[dx * 4 + 3];
                 alpha[dx * 4 + 2] = (__fp16)0.f;
                 alpha[dx * 4 + 3] = (__fp16)0.f;
@@ -75,7 +75,7 @@ static void cubic_coeffs_fp16sa(int w, int outw, int* xofs, __fp16* alpha, int a
             if (sx >= w - 1)
             {
                 sx = w - 3;
-                alpha[dx * 4 + 3] = (__fp16)(1.f - alpha[dx * 4 + 0]);
+                alpha[dx * 4 + 3] = (__fp16)((__fp16)1.f - alpha[dx * 4 + 0]);
                 alpha[dx * 4 + 2] = (__fp16)(alpha[dx * 4 + 0]);
                 alpha[dx * 4 + 1] = (__fp16)0.f;
                 alpha[dx * 4 + 0] = (__fp16)0.f;
diff --git a/src/layer/arm/mish_arm_asimdhp.cpp b/src/layer/arm/mish_arm_asimdhp.cpp
index 0e04883370e..70dd74ae22c 100644
--- a/src/layer/arm/mish_arm_asimdhp.cpp
+++ b/src/layer/arm/mish_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #include "neon_mathfun.h"
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #include "neon_mathfun_fp16s.h"
 #endif
@@ -99,7 +100,7 @@ int Mish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co
         for (int i = 0; i < size; i++)
         {
             float16x8_t _p = vld1q_f16(ptr);
-            _p = vmulq_f16(_p, tanh_ps(log_ps(vaddq_f16(exp_ps(_p), vdupq_n_f16(1.f)))));
+            _p = vmulq_f16(_p, tanh_ps_f16(log_ps_f16(vaddq_f16(exp_ps_f16(_p), vdupq_n_f16(1.f)))));
             vst1q_f16(ptr, _p);
 
             ptr += 8;
@@ -119,7 +120,7 @@ int Mish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co
         for (int i = 0; i < size; i++)
         {
             float16x4_t _p = vld1_f16(ptr);
-            _p = vmul_f16(_p, tanh_ps(log_ps(vadd_f16(exp_ps(_p), vdup_n_f16(1.f)))));
+            _p = vmul_f16(_p, tanh_ps_f16(log_ps_f16(vadd_f16(exp_ps_f16(_p), vdup_n_f16(1.f)))));
             vst1_f16(ptr, _p);
 
             ptr += 4;
@@ -138,7 +139,7 @@ int Mish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co
         for (; i + 3 < size; i += 4)
         {
             float16x4_t _p = vld1_f16(ptr);
-            _p = vmul_f16(_p, tanh_ps(log_ps(vadd_f16(exp_ps(_p), vdup_n_f16(1.f)))));
+            _p = vmul_f16(_p, tanh_ps_f16(log_ps_f16(vadd_f16(exp_ps_f16(_p), vdup_n_f16(1.f)))));
             vst1_f16(ptr, _p);
 
             ptr += 4;
@@ -146,7 +147,7 @@ int Mish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co
         for (; i < size; i++)
         {
             __fp16 v = *ptr;
-            v = v * tanhf(logf(expf(v) + (__fp16)1.f));
+            v = v * (__fp16)tanhf(logf(expf(v) + 1.f));
             *ptr = v;
             ptr++;
         }
diff --git a/src/layer/arm/neon_mathfun_fp16s.h b/src/layer/arm/neon_mathfun_fp16s.h
index 074681809bf..2f4864c13a3 100644
--- a/src/layer/arm/neon_mathfun_fp16s.h
+++ b/src/layer/arm/neon_mathfun_fp16s.h
@@ -61,7 +61,7 @@
 /* natural logarithm computed for 4 simultaneous float
  * return NaN for x <= 0
  */
-static inline float16x4_t log_ps(float16x4_t x)
+static inline float16x4_t log_ps_f16(float16x4_t x)
 {
     float16x4_t one = vdup_n_f16(1);
 
@@ -119,7 +119,7 @@ static inline float16x4_t log_ps(float16x4_t x)
     return x;
 }
 
-static inline float16x8_t log_ps(float16x8_t x)
+static inline float16x8_t log_ps_f16(float16x8_t x)
 {
     float16x8_t one = vdupq_n_f16(1);
 
@@ -192,7 +192,7 @@ static inline float16x8_t log_ps(float16x8_t x)
 #define c_cephes_exp_p5 5.0000001201E-1
 
 /* exp() computed for 4 float at once */
-static inline float16x4_t exp_ps(float16x4_t x)
+static inline float16x4_t exp_ps_f16(float16x4_t x)
 {
     float16x4_t tmp, fx;
 
@@ -201,7 +201,11 @@ static inline float16x4_t exp_ps(float16x4_t x)
     x = vmax_f16(x, vdup_n_f16(c_exp_lo_f16));
 
     /* express exp(x) as exp(g + n*log(2)) */
+#if _MSC_VER
+    fx = vfma_f16(vdup_n_f16(0.5f), x, vcvt_f16_f32(vdupq_n_f32(c_cephes_LOG2EF)));
+#else
     fx = vfma_f16(vdup_n_f16(0.5f), x, vdup_n_f16(c_cephes_LOG2EF));
+#endif
 
     /* perform a floorf */
     tmp = vcvt_f16_s16(vcvt_s16_f16(fx));
@@ -212,8 +216,13 @@ static inline float16x4_t exp_ps(float16x4_t x)
 
     fx = vsub_f16(tmp, (float16x4_t)(mask));
 
+#if _MSC_VER
+    tmp = vmul_f16(fx, vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1)));
+    float16x4_t z = vmul_f16(fx, vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C2)));
+#else
     tmp = vmul_f16(fx, vdup_n_f16(c_cephes_exp_C1));
     float16x4_t z = vmul_f16(fx, vdup_n_f16(c_cephes_exp_C2));
+#endif
     x = vsub_f16(x, tmp);
     x = vsub_f16(x, z);
 
@@ -240,7 +249,7 @@ static inline float16x4_t exp_ps(float16x4_t x)
     return y;
 }
 
-static inline float16x8_t exp_ps(float16x8_t x)
+static inline float16x8_t exp_ps_f16(float16x8_t x)
 {
     float16x8_t tmp, fx;
 
@@ -249,7 +258,12 @@ static inline float16x8_t exp_ps(float16x8_t x)
     x = vmaxq_f16(x, vdupq_n_f16(c_exp_lo_f16));
 
     /* express exp(x) as exp(g + n*log(2)) */
+#if _MSC_VER
+    float16x4_t _c_cephes_LOG2EF = vcvt_f16_f32(vdupq_n_f32(c_cephes_LOG2EF));
+    fx = vfmaq_f16(vdupq_n_f16(0.5f), x, vcombine_f16(_c_cephes_LOG2EF, _c_cephes_LOG2EF));
+#else
     fx = vfmaq_f16(vdupq_n_f16(0.5f), x, vdupq_n_f16(c_cephes_LOG2EF));
+#endif
 
     /* perform a floorf */
     tmp = vcvtq_f16_s16(vcvtq_s16_f16(fx));
@@ -260,8 +274,15 @@ static inline float16x8_t exp_ps(float16x8_t x)
 
     fx = vsubq_f16(tmp, vreinterpretq_f16_u16(mask));
 
+#if _MSC_VER
+    float16x4_t _c_cephes_exp_C1 = vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1));
+    tmp = vmulq_f16(fx, vcombine_f16(_c_cephes_exp_C1, _c_cephes_exp_C1));
+    float16x4_t _c_cephes_exp_C2 = vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C2));
+    float16x8_t z = vmulq_f16(fx, vcombine_f16(_c_cephes_exp_C2, _c_cephes_exp_C2));
+#else
     tmp = vmulq_f16(fx, vdupq_n_f16(c_cephes_exp_C1));
     float16x8_t z = vmulq_f16(fx, vdupq_n_f16(c_cephes_exp_C2));
+#endif
     x = vsubq_f16(x, tmp);
     x = vsubq_f16(x, z);
 
@@ -314,7 +335,7 @@ static inline float16x8_t exp_ps(float16x8_t x)
  * almost no extra price so both sin_ps and cos_ps make use of
  * sincos_ps..
  */
-static inline void sincos_ps(float16x4_t x, float16x4_t* ysin, float16x4_t* ycos)
+static inline void sincos_ps_f16(float16x4_t x, float16x4_t* ysin, float16x4_t* ycos)
 {
     // any x
     float16x4_t y;
@@ -326,7 +347,12 @@ static inline void sincos_ps(float16x4_t x, float16x4_t* ysin, float16x4_t* ycos
     x = vabs_f16(x);
 
     /* scale by 4/Pi */
+#if _MSC_VER
+    float16x4_t _c_cephes_FOPI = vcvt_f16_f32(vdupq_n_f32(c_cephes_FOPI));
+    y = vmul_f16(x, _c_cephes_FOPI);
+#else
     y = vmul_f16(x, vdup_n_f16(c_cephes_FOPI));
+#endif
 
     /* store the integer part of y in mm0 */
     emm2 = vcvt_u16_f16(y);
@@ -345,9 +371,18 @@ static inline void sincos_ps(float16x4_t x, float16x4_t* ysin, float16x4_t* ycos
     /* The magic pass: "Extended precision modular arithmetic"
      * x = ((x - y * DP1) - y * DP2) - y * DP3; */
+#if _MSC_VER
+    float16x4_t _c_minus_cephes_DP1 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP1));
+    float16x4_t _c_minus_cephes_DP2 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP2));
+    float16x4_t _c_minus_cephes_DP3 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP3));
+    x = vfma_f16(x, y, _c_minus_cephes_DP1);
+    x = vfma_f16(x, y, _c_minus_cephes_DP2);
+    x = vfma_f16(x, y, _c_minus_cephes_DP3);
+#else
     x = vfma_f16(x, y, vdup_n_f16(c_minus_cephes_DP1));
     x = vfma_f16(x, y, vdup_n_f16(c_minus_cephes_DP2));
     x = vfma_f16(x, y, vdup_n_f16(c_minus_cephes_DP3));
+#endif
 
     sign_mask_sin = veor_u16(sign_mask_sin, vtst_u16(emm2, vdup_n_u16(4)));
     sign_mask_cos = vtst_u16(vsub_u16(emm2, vdup_n_u16(2)), vdup_n_u16(4));
@@ -375,7 +410,7 @@ static inline void sincos_ps(float16x4_t x, float16x4_t* ysin, float16x4_t* ycos
     *ycos = vbsl_f16(sign_mask_cos, yc, vneg_f16(yc));
 }
 
-static inline void sincos_ps(float16x8_t x, float16x8_t* ysin, float16x8_t* ycos)
+static inline void sincos_ps_f16(float16x8_t x, float16x8_t* ysin, float16x8_t* ycos)
 {
     // any x
     float16x8_t y;
@@ -387,7 +422,12 @@ static inline void sincos_ps(float16x8_t x, float16x8_t* ysin, float16x8_t* ycos
     x = vabsq_f16(x);
 
     /* scale by 4/Pi */
+#if _MSC_VER
+    float16x4_t _c_cephes_FOPI = vcvt_f16_f32(vdupq_n_f32(c_cephes_FOPI));
+    y = vmulq_f16(x, vcombine_f16(_c_cephes_FOPI, _c_cephes_FOPI));
+#else
     y = vmulq_f16(x, vdupq_n_f16(c_cephes_FOPI));
+#endif
 
     /* store the integer part of y in mm0 */
     emm2 = vcvtq_u16_f16(y);
@@ -406,9 +446,18 @@ static inline void sincos_ps(float16x8_t x, float16x8_t* ysin, float16x8_t* ycos
     /* The magic pass: "Extended precision modular arithmetic"
      * x = ((x - y * DP1) - y * DP2) - y * DP3; */
+#if _MSC_VER
+    float16x4_t _c_minus_cephes_DP1 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP1));
+    float16x4_t _c_minus_cephes_DP2 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP2));
+    float16x4_t _c_minus_cephes_DP3 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP3));
+    x = vfmaq_f16(x, y, vcombine_f16(_c_minus_cephes_DP1, _c_minus_cephes_DP1));
+    x = vfmaq_f16(x, y, 
vcombine_f16(_c_minus_cephes_DP2, _c_minus_cephes_DP2)); + x = vfmaq_f16(x, y, vcombine_f16(_c_minus_cephes_DP3, _c_minus_cephes_DP3)); +#else x = vfmaq_f16(x, y, vdupq_n_f16(c_minus_cephes_DP1)); x = vfmaq_f16(x, y, vdupq_n_f16(c_minus_cephes_DP2)); x = vfmaq_f16(x, y, vdupq_n_f16(c_minus_cephes_DP3)); +#endif sign_mask_sin = veorq_u16(sign_mask_sin, vtstq_u16(emm2, vdupq_n_u16(4))); sign_mask_cos = vtstq_u16(vsubq_u16(emm2, vdupq_n_u16(2)), vdupq_n_u16(4)); @@ -436,31 +485,31 @@ static inline void sincos_ps(float16x8_t x, float16x8_t* ysin, float16x8_t* ycos *ycos = vbslq_f16(sign_mask_cos, yc, vnegq_f16(yc)); } -static inline float16x4_t sin_ps(float16x4_t x) +static inline float16x4_t sin_ps_f16(float16x4_t x) { float16x4_t ysin, ycos; - sincos_ps(x, &ysin, &ycos); + sincos_ps_f16(x, &ysin, &ycos); return ysin; } -static inline float16x8_t sin_ps(float16x8_t x) +static inline float16x8_t sin_ps_f16(float16x8_t x) { float16x8_t ysin, ycos; - sincos_ps(x, &ysin, &ycos); + sincos_ps_f16(x, &ysin, &ycos); return ysin; } -static inline float16x4_t cos_ps(float16x4_t x) +static inline float16x4_t cos_ps_f16(float16x4_t x) { float16x4_t ysin, ycos; - sincos_ps(x, &ysin, &ycos); + sincos_ps_f16(x, &ysin, &ycos); return ycos; } -static inline float16x8_t cos_ps(float16x8_t x) +static inline float16x8_t cos_ps_f16(float16x8_t x) { float16x8_t ysin, ycos; - sincos_ps(x, &ysin, &ycos); + sincos_ps_f16(x, &ysin, &ycos); return ycos; } @@ -481,7 +530,7 @@ static inline float16x8_t cos_ps(float16x8_t x) #define c_tanh_beta_6 1.19825839466702e-6f /* Single precision hyperbolic tangent computed for 4 simultaneous float */ -static inline float16x4_t tanh_ps(float16x4_t x) +static inline float16x4_t tanh_ps_f16(float16x4_t x) { float16x4_t x2 = vabs_f16(x); @@ -522,7 +571,7 @@ static inline float16x4_t tanh_ps(float16x4_t x) return y; } -static inline float16x8_t tanh_ps(float16x8_t x) +static inline float16x8_t tanh_ps_f16(float16x8_t x) { float16x8_t x2 = vabsq_f16(x); @@ -563,20 +612,20 @@ static inline float16x8_t tanh_ps(float16x8_t x) return y; } -static inline float16x4_t sigmoid_ps(float16x4_t _v) +static inline float16x4_t sigmoid_ps_f16(float16x4_t _v) { float16x4_t _one = vdup_n_f16(1.f); _v = vneg_f16(_v); - _v = exp_ps(_v); + _v = exp_ps_f16(_v); _v = vadd_f16(_v, _one); return vdiv_f16(_one, _v); } -static inline float16x8_t sigmoid_ps(float16x8_t _v) +static inline float16x8_t sigmoid_ps_f16(float16x8_t _v) { float16x8_t _one = vdupq_n_f16(1.f); _v = vnegq_f16(_v); - _v = exp_ps(_v); + _v = exp_ps_f16(_v); _v = vaddq_f16(_v, _one); return vdivq_f16(_one, _v); } diff --git a/src/layer/arm/pooling_arm_asimdhp.cpp b/src/layer/arm/pooling_arm_asimdhp.cpp index 856c6c22ae3..ceabb1fda38 100644 --- a/src/layer/arm/pooling_arm_asimdhp.cpp +++ b/src/layer/arm/pooling_arm_asimdhp.cpp @@ -18,6 +18,7 @@ #if __ARM_NEON #include +#include "arm_usability.h" #endif // __ARM_NEON namespace ncnn { @@ -611,7 +612,12 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt } } +#if _MSC_VER + float16x4_t _inv_area0 = vcvt_f16_f32(vdupq_n_f32(1.f / area)); + float16x8_t _inv_area = vcombine_f16(_inv_area0, _inv_area0); +#else float16x8_t _inv_area = vdupq_n_f16((__fp16)(1.f / area)); +#endif float16x8_t _avg = vmulq_f16(_sum, _inv_area); vst1q_f16(outptr + j * 8, _avg); } @@ -666,7 +672,11 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt } } +#if _MSC_VER + float16x4_t _inv_area = vcvt_f16_f32(vdupq_n_f32(1.f / area)); +#else float16x4_t _inv_area = 
vdup_n_f16((__fp16)(1.f / area)); +#endif float16x4_t _avg = vmul_f16(_sum, _inv_area); vst1_f16(outptr + j * 4, _avg); } @@ -721,7 +731,7 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt } } - outptr[j] = sum / area; + outptr[j] = sum / (__fp16)area; } outptr += outw; @@ -740,7 +750,12 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt const Mat m = bottom_blob_bordered.channel(q); __fp16* outptr = top_blob.channel(q); +#if _MSC_VER + float16x4_t _inv_maxk0 = vcvt_f16_f32(vdupq_n_f32(1.f / maxk)); + float16x8_t _inv_maxk = vcombine_f16(_inv_maxk0, _inv_maxk0); +#else float16x8_t _inv_maxk = vdupq_n_f16((__fp16)(1.f / maxk)); +#endif for (int i = 0; i < outh; i++) { @@ -773,7 +788,11 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt const Mat m = bottom_blob_bordered.channel(q); __fp16* outptr = top_blob.channel(q); +#if _MSC_VER + float16x4_t _inv_maxk = vcvt_f16_f32(vdupq_n_f32(1.f / maxk)); +#else float16x4_t _inv_maxk = vdup_n_f16((__fp16)(1.f / maxk)); +#endif for (int i = 0; i < outh; i++) { @@ -820,7 +839,7 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt sum += val; } - outptr[j] = sum / maxk; + outptr[j] = sum / (__fp16)maxk; } outptr += outw; diff --git a/src/layer/arm/prelu_arm_asimdhp.cpp b/src/layer/arm/prelu_arm_asimdhp.cpp index a87526b7cc3..0d22a37c54c 100644 --- a/src/layer/arm/prelu_arm_asimdhp.cpp +++ b/src/layer/arm/prelu_arm_asimdhp.cpp @@ -16,6 +16,7 @@ #if __ARM_NEON #include +#include "arm_usability.h" #endif // __ARM_NEON namespace ncnn { @@ -475,7 +476,11 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c const float slope = num_slope > 1 ? slope_data[i] : slope_data[0]; float16x4_t _zero = vdup_n_f16(0.f); +#if _MSC_VER + float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(slope)); +#else float16x4_t _slope = vdup_n_f16((__fp16)slope); +#endif int j = 0; for (; j + 3 < w; j += 4) @@ -514,7 +519,11 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c const float slope = num_slope > 1 ? 
slope_data[q] : slope_data[0]; float16x4_t _zero = vdup_n_f16(0.f); +#if _MSC_VER + float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(slope)); +#else float16x4_t _slope = vdup_n_f16((__fp16)slope); +#endif int j = 0; for (; j + 3 < size; j += 4) diff --git a/src/layer/arm/relu_arm_asimdhp.cpp b/src/layer/arm/relu_arm_asimdhp.cpp index 04f2b7f3234..55e75d88596 100644 --- a/src/layer/arm/relu_arm_asimdhp.cpp +++ b/src/layer/arm/relu_arm_asimdhp.cpp @@ -16,6 +16,7 @@ #if __ARM_NEON #include +#include "arm_usability.h" #endif // __ARM_NEON namespace ncnn { diff --git a/src/layer/arm/sigmoid_arm_asimdhp.cpp b/src/layer/arm/sigmoid_arm_asimdhp.cpp index 65c32ee3e67..5d777f2769b 100644 --- a/src/layer/arm/sigmoid_arm_asimdhp.cpp +++ b/src/layer/arm/sigmoid_arm_asimdhp.cpp @@ -16,6 +16,7 @@ #if __ARM_NEON #include +#include "arm_usability.h" #include "neon_mathfun.h" #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #include "neon_mathfun_fp16s.h" @@ -110,10 +111,10 @@ int Sigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) float16x8_t _p1 = vld1q_f16(ptr + 8); float16x8_t _p2 = vld1q_f16(ptr + 16); float16x8_t _p3 = vld1q_f16(ptr + 24); - _p0 = sigmoid_ps(_p0); - _p1 = sigmoid_ps(_p1); - _p2 = sigmoid_ps(_p2); - _p3 = sigmoid_ps(_p3); + _p0 = sigmoid_ps_f16(_p0); + _p1 = sigmoid_ps_f16(_p1); + _p2 = sigmoid_ps_f16(_p2); + _p3 = sigmoid_ps_f16(_p3); vst1q_f16(ptr, _p0); vst1q_f16(ptr + 8, _p1); vst1q_f16(ptr + 16, _p2); @@ -124,8 +125,8 @@ int Sigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) { float16x8_t _p0 = vld1q_f16(ptr); float16x8_t _p1 = vld1q_f16(ptr + 8); - _p0 = sigmoid_ps(_p0); - _p1 = sigmoid_ps(_p1); + _p0 = sigmoid_ps_f16(_p0); + _p1 = sigmoid_ps_f16(_p1); vst1q_f16(ptr, _p0); vst1q_f16(ptr + 8, _p1); ptr += 16; @@ -133,14 +134,14 @@ int Sigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) for (; i + 7 < size; i += 8) { float16x8_t _p = vld1q_f16(ptr); - _p = sigmoid_ps(_p); + _p = sigmoid_ps_f16(_p); vst1q_f16(ptr, _p); ptr += 8; } for (; i + 3 < size; i += 4) { float16x4_t _p = vld1_f16(ptr); - _p = sigmoid_ps(_p); + _p = sigmoid_ps_f16(_p); vst1_f16(ptr, _p); ptr += 4; } diff --git a/src/layer/arm/softmax_arm_asimdhp.cpp b/src/layer/arm/softmax_arm_asimdhp.cpp index d8efaf4c3b9..844e32ce908 100644 --- a/src/layer/arm/softmax_arm_asimdhp.cpp +++ b/src/layer/arm/softmax_arm_asimdhp.cpp @@ -18,6 +18,7 @@ #if __ARM_NEON #include +#include "arm_usability.h" #include "neon_mathfun_fp16s.h" #endif // __ARM_NEON @@ -72,7 +73,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) for (; i + 7 < size; i += 8) { float16x8_t _p = vld1q_f16(ptr); - _p = exp_ps(vsubq_f16(_p, _max)); + _p = exp_ps_f16(vsubq_f16(_p, _max)); vst1q_f16(ptr, _p); _sum = vaddq_f16(_sum, _p); ptr += 8; @@ -80,7 +81,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) for (; i + 3 < size; i += 4) { float16x4_t _p = vld1_f16(ptr); - _p = exp_ps(vsub_f16(_p, vget_low_f16(_max))); + _p = exp_ps_f16(vsub_f16(_p, vget_low_f16(_max))); vst1_f16(ptr, _p); _sum = vcombine_f16(vadd_f16(vget_low_f16(_sum), _p), vget_high_f16(_sum)); ptr += 4; @@ -243,10 +244,10 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float16x8_t _p3 = vld1q_f16(ptr + 24); float16x4_t _max = vld1_f16(maxptr); float16x4_t _sum = vld1_f16(sumptr); - _p0 = exp_ps(vsubq_f16(_p0, vdupq_lane_f16(_max, 0))); - _p1 = exp_ps(vsubq_f16(_p1, vdupq_lane_f16(_max, 1))); - _p2 = exp_ps(vsubq_f16(_p2, 
vdupq_lane_f16(_max, 2))); - _p3 = exp_ps(vsubq_f16(_p3, vdupq_lane_f16(_max, 3))); + _p0 = exp_ps_f16(vsubq_f16(_p0, vdupq_lane_f16(_max, 0))); + _p1 = exp_ps_f16(vsubq_f16(_p1, vdupq_lane_f16(_max, 1))); + _p2 = exp_ps_f16(vsubq_f16(_p2, vdupq_lane_f16(_max, 2))); + _p3 = exp_ps_f16(vsubq_f16(_p3, vdupq_lane_f16(_max, 3))); vst1q_f16(ptr, _p0); vst1q_f16(ptr + 8, _p1); vst1q_f16(ptr + 16, _p2); @@ -264,7 +265,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { float16x8_t _p = vld1q_f16(ptr); float16x8_t _max = vdupq_n_f16(*maxptr); - _p = exp_ps(vsubq_f16(_p, _max)); + _p = exp_ps_f16(vsubq_f16(_p, _max)); vst1q_f16(ptr, _p); float16x4_t _sum2 = vadd_f16(vget_low_f16(_p), vget_high_f16(_p)); float16x4_t _ss2 = vpadd_f16(_sum2, _sum2); @@ -286,8 +287,8 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float16x4_t _sum = vld1_f16(sumptr); float16x8_t _max0 = vcombine_f16(vdup_lane_f16(_max, 0), vdup_lane_f16(_max, 1)); float16x8_t _max1 = vcombine_f16(vdup_lane_f16(_max, 2), vdup_lane_f16(_max, 3)); - _p0 = exp_ps(vsubq_f16(_p0, _max0)); - _p1 = exp_ps(vsubq_f16(_p1, _max1)); + _p0 = exp_ps_f16(vsubq_f16(_p0, _max0)); + _p1 = exp_ps_f16(vsubq_f16(_p1, _max1)); vst1q_f16(ptr, _p0); vst1q_f16(ptr + 8, _p1); float16x8_t _ss2 = vpaddq_f16(_p0, _p1); @@ -301,7 +302,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { float16x4_t _p = vld1_f16(ptr); float16x4_t _max = vdup_n_f16(*maxptr); - _p = exp_ps(vsub_f16(_p, _max)); + _p = exp_ps_f16(vsub_f16(_p, _max)); vst1_f16(ptr, _p); float16x4_t _ss2 = vpadd_f16(_p, _p); __fp16 sum0 = vget_lane_f16(_ss2, 0) + vget_lane_f16(_ss2, 1); @@ -319,7 +320,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float16x8_t _p = vld1q_f16(ptr); float16x8_t _max = vld1q_f16(maxptr); float16x8_t _sum = vld1q_f16(sumptr); - _p = exp_ps(vsubq_f16(_p, _max)); + _p = exp_ps_f16(vsubq_f16(_p, _max)); _sum = vaddq_f16(_sum, _p); vst1q_f16(ptr, _p); vst1q_f16(sumptr, _sum); @@ -332,7 +333,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float16x4_t _p = vld1_f16(ptr); float16x4_t _max = vld1_f16(maxptr); float16x4_t _sum = vld1_f16(sumptr); - _p = exp_ps(vsub_f16(_p, _max)); + _p = exp_ps_f16(vsub_f16(_p, _max)); _sum = vadd_f16(_sum, _p); vst1_f16(ptr, _p); vst1_f16(sumptr, _sum); @@ -465,7 +466,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) for (; j + 7 < size; j += 8) { float16x8_t _p = vld1q_f16(ptr); - _p = exp_ps(vsubq_f16(_p, _max)); + _p = exp_ps_f16(vsubq_f16(_p, _max)); vst1q_f16(ptr, _p); _sum = vaddq_f16(_sum, _p); ptr += 8; @@ -473,7 +474,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) for (; j + 3 < size; j += 4) { float16x4_t _p = vld1_f16(ptr); - _p = exp_ps(vsub_f16(_p, vget_low_f16(_max))); + _p = exp_ps_f16(vsub_f16(_p, vget_low_f16(_max))); vst1_f16(ptr, _p); _sum = vcombine_f16(vadd_f16(vget_low_f16(_sum), _p), vget_high_f16(_sum)); ptr += 4; @@ -643,10 +644,10 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float16x8_t _p2 = vld1q_f16(ptr + 16); float16x8_t _p3 = vld1q_f16(ptr + 24); float16x4_t _max = vld1_f16(maxptr); - _p0 = exp_ps(vsubq_f16(_p0, vdupq_lane_f16(_max, 0))); - _p1 = exp_ps(vsubq_f16(_p1, vdupq_lane_f16(_max, 1))); - _p2 = exp_ps(vsubq_f16(_p2, vdupq_lane_f16(_max, 2))); - _p3 = exp_ps(vsubq_f16(_p3, vdupq_lane_f16(_max, 3))); + _p0 = 
exp_ps_f16(vsubq_f16(_p0, vdupq_lane_f16(_max, 0))); + _p1 = exp_ps_f16(vsubq_f16(_p1, vdupq_lane_f16(_max, 1))); + _p2 = exp_ps_f16(vsubq_f16(_p2, vdupq_lane_f16(_max, 2))); + _p3 = exp_ps_f16(vsubq_f16(_p3, vdupq_lane_f16(_max, 3))); vst1q_f16(ptr, _p0); vst1q_f16(ptr + 8, _p1); vst1q_f16(ptr + 16, _p2); @@ -658,7 +659,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { float16x8_t _p = vld1q_f16(ptr); float16x8_t _max = vdupq_n_f16(*maxptr); - _p = exp_ps(vsubq_f16(_p, _max)); + _p = exp_ps_f16(vsubq_f16(_p, _max)); vst1q_f16(ptr, _p); ptr += 8; maxptr++; @@ -674,8 +675,8 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float16x4_t _max = vld1_f16(maxptr); float16x8_t _max0 = vcombine_f16(vdup_lane_f16(_max, 0), vdup_lane_f16(_max, 1)); float16x8_t _max1 = vcombine_f16(vdup_lane_f16(_max, 2), vdup_lane_f16(_max, 3)); - _p0 = exp_ps(vsubq_f16(_p0, _max0)); - _p1 = exp_ps(vsubq_f16(_p1, _max1)); + _p0 = exp_ps_f16(vsubq_f16(_p0, _max0)); + _p1 = exp_ps_f16(vsubq_f16(_p1, _max1)); vst1q_f16(ptr, _p0); vst1q_f16(ptr + 8, _p1); ptr += 16; @@ -685,7 +686,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { float16x4_t _p = vld1_f16(ptr); float16x4_t _max = vdup_n_f16(*maxptr); - _p = exp_ps(vsub_f16(_p, _max)); + _p = exp_ps_f16(vsub_f16(_p, _max)); vst1_f16(ptr, _p); ptr += 4; maxptr++; @@ -698,7 +699,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { float16x8_t _p = vld1q_f16(ptr); float16x8_t _max = vld1q_f16(maxptr); - _p = exp_ps(vsubq_f16(_p, _max)); + _p = exp_ps_f16(vsubq_f16(_p, _max)); vst1q_f16(ptr, _p); ptr += 8; maxptr += 8; @@ -707,7 +708,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { float16x4_t _p = vld1_f16(ptr); float16x4_t _max = vld1_f16(maxptr); - _p = exp_ps(vsub_f16(_p, _max)); + _p = exp_ps_f16(vsub_f16(_p, _max)); vst1_f16(ptr, _p); ptr += 4; maxptr += 4; @@ -943,7 +944,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float16x8_t _p = vld1q_f16(ptr); float16x8_t _max = vld1q_f16(maxptr); float16x8_t _sum = vld1q_f16(sumptr); - _p = exp_ps(vsubq_f16(_p, _max)); + _p = exp_ps_f16(vsubq_f16(_p, _max)); _sum = vaddq_f16(_sum, _p); vst1q_f16(ptr, _p); vst1q_f16(sumptr, _sum); @@ -956,7 +957,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float16x4_t _p = vld1_f16(ptr); float16x4_t _max = vld1_f16(maxptr); float16x4_t _sum = vld1_f16(sumptr); - _p = exp_ps(vsub_f16(_p, _max)); + _p = exp_ps_f16(vsub_f16(_p, _max)); _sum = vadd_f16(_sum, _p); vst1_f16(ptr, _p); vst1_f16(sumptr, _sum); @@ -1070,7 +1071,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) for (; j + 7 < size; j += 8) { float16x8_t _p = vld1q_f16(ptr); - _p = exp_ps(vsubq_f16(_p, _max)); + _p = exp_ps_f16(vsubq_f16(_p, _max)); vst1q_f16(ptr, _p); _sum = vaddq_f16(_sum, _p); ptr += 8; @@ -1078,7 +1079,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) for (; j + 3 < size; j += 4) { float16x4_t _p = vld1_f16(ptr); - _p = exp_ps(vsub_f16(_p, vget_low_f16(_max))); + _p = exp_ps_f16(vsub_f16(_p, vget_low_f16(_max))); vst1_f16(ptr, _p); _sum = vcombine_f16(vadd_f16(vget_low_f16(_sum), _p), vget_high_f16(_sum)); ptr += 4; diff --git a/src/layer/arm/swish_arm_asimdhp.cpp b/src/layer/arm/swish_arm_asimdhp.cpp index 4aee8a898c4..6cda2d0dd72 100644 --- a/src/layer/arm/swish_arm_asimdhp.cpp +++ 
b/src/layer/arm/swish_arm_asimdhp.cpp @@ -16,6 +16,7 @@ #if __ARM_NEON #include +#include "arm_usability.h" #include "neon_mathfun.h" #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #include "neon_mathfun_fp16s.h" @@ -128,8 +129,8 @@ int Swish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c { float16x8_t _p0 = vld1q_f16(ptr); float16x8_t _p1 = vld1q_f16(ptr + 8); - _p0 = vdivq_f16(_p0, vaddq_f16(_one, exp_ps(vnegq_f16(_p0)))); - _p1 = vdivq_f16(_p1, vaddq_f16(_one, exp_ps(vnegq_f16(_p1)))); + _p0 = vdivq_f16(_p0, vaddq_f16(_one, exp_ps_f16(vnegq_f16(_p0)))); + _p1 = vdivq_f16(_p1, vaddq_f16(_one, exp_ps_f16(vnegq_f16(_p1)))); vst1q_f16(ptr, _p0); vst1q_f16(ptr + 8, _p1); ptr += 16; @@ -137,21 +138,21 @@ int Swish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c for (; i + 7 < size; i += 8) { float16x8_t _p = vld1q_f16(ptr); - _p = vdivq_f16(_p, vaddq_f16(_one, exp_ps(vnegq_f16(_p)))); + _p = vdivq_f16(_p, vaddq_f16(_one, exp_ps_f16(vnegq_f16(_p)))); vst1q_f16(ptr, _p); ptr += 8; } for (; i + 3 < size; i += 4) { float16x4_t _p = vld1_f16(ptr); - _p = vdiv_f16(_p, vadd_f16(vget_low_f16(_one), exp_ps(vneg_f16(_p)))); + _p = vdiv_f16(_p, vadd_f16(vget_low_f16(_one), exp_ps_f16(vneg_f16(_p)))); vst1_f16(ptr, _p); ptr += 4; } for (; i < size; i++) { __fp16 v = *ptr; - v = v / ((__fp16)1.f + expf(-v)); + v = v / (__fp16)(1.f + expf(-v)); *ptr = v; ptr++; diff --git a/src/layer/arm/tanh_arm_asimdhp.cpp b/src/layer/arm/tanh_arm_asimdhp.cpp index 10f3303a1ce..42194a945af 100644 --- a/src/layer/arm/tanh_arm_asimdhp.cpp +++ b/src/layer/arm/tanh_arm_asimdhp.cpp @@ -16,6 +16,7 @@ #if __ARM_NEON #include +#include "arm_usability.h" #include "neon_mathfun.h" #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #include "neon_mathfun_fp16s.h" @@ -99,7 +100,7 @@ int TanH_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co for (int i = 0; i < size; i++) { float16x8_t _p = vld1q_f16(ptr); - _p = tanh_ps(_p); + _p = tanh_ps_f16(_p); vst1q_f16(ptr, _p); ptr += 8; @@ -119,7 +120,7 @@ int TanH_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co for (int i = 0; i < size; i++) { float16x4_t _p = vld1_f16(ptr); - _p = tanh_ps(_p); + _p = tanh_ps_f16(_p); vst1_f16(ptr, _p); ptr += 4; @@ -138,7 +139,7 @@ int TanH_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co for (; i + 3 < size; i += 4) { float16x4_t _p = vld1_f16(ptr); - _p = tanh_ps(_p); + _p = tanh_ps_f16(_p); vst1_f16(ptr, _p); ptr += 4; diff --git a/src/layer/arm/unaryop_arm_asimdhp.cpp b/src/layer/arm/unaryop_arm_asimdhp.cpp index ac64fc708f9..f42848226c0 100644 --- a/src/layer/arm/unaryop_arm_asimdhp.cpp +++ b/src/layer/arm/unaryop_arm_asimdhp.cpp @@ -19,6 +19,7 @@ #if __ARM_NEON #include +#include "arm_usability.h" #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #include "neon_mathfun_fp16s.h" #endif @@ -181,7 +182,7 @@ struct unary_op_rsqrt_fp16s { __fp16 func(const __fp16& x) const { - return (__fp16)1.f / sqrtf(x); + return (__fp16)1.f / (__fp16)sqrtf(x); } float16x4_t func_pack4(const float16x4_t& x) const { @@ -207,11 +208,11 @@ struct unary_op_exp_fp16s } float16x4_t func_pack4(const float16x4_t& x) const { - return exp_ps(x); + return exp_ps_f16(x); } float16x8_t func_pack8(const float16x8_t& x) const { - return exp_ps(x); + return exp_ps_f16(x); } }; @@ -223,11 +224,11 @@ struct unary_op_log_fp16s } float16x4_t func_pack4(const float16x4_t& x) const { - return log_ps(x); + return log_ps_f16(x); } float16x8_t func_pack8(const float16x8_t& x) const { - return log_ps(x); 
+        return log_ps_f16(x);
     }
 };

@@ -239,11 +240,11 @@ struct unary_op_sin_fp16s
     }
     float16x4_t func_pack4(const float16x4_t& x) const
     {
-        return sin_ps(x);
+        return sin_ps_f16(x);
     }
     float16x8_t func_pack8(const float16x8_t& x) const
     {
-        return sin_ps(x);
+        return sin_ps_f16(x);
     }
 };

@@ -255,11 +256,11 @@ struct unary_op_cos_fp16s
     }
     float16x4_t func_pack4(const float16x4_t& x) const
     {
-        return cos_ps(x);
+        return cos_ps_f16(x);
     }
     float16x8_t func_pack8(const float16x8_t& x) const
     {
-        return cos_ps(x);
+        return cos_ps_f16(x);
     }
 };

@@ -429,11 +430,11 @@ struct unary_op_tanh_fp16s
     }
     float16x4_t func_pack4(const float16x4_t& x) const
     {
-        return tanh_ps(x);
+        return tanh_ps_f16(x);
     }
     float16x8_t func_pack8(const float16x8_t& x) const
     {
-        return tanh_ps(x);
+        return tanh_ps_f16(x);
     }
 };

@@ -445,11 +446,11 @@ struct unary_op_log10_fp16s
     }
     float16x4_t func_pack4(const float16x4_t& x) const
     {
-        return vmul_f16(log_ps(x), vdup_n_f16(0.434294481903));
+        return vmul_f16(log_ps_f16(x), vdup_n_f16(0.434294481903));
     }
     float16x8_t func_pack8(const float16x8_t& x) const
     {
-        return vmulq_f16(log_ps(x), vdupq_n_f16(0.434294481903));
+        return vmulq_f16(log_ps_f16(x), vdupq_n_f16(0.434294481903));
     }
 };
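Note: the MSVC workaround applied at each call site above follows one pattern. MSVC's arm64 <arm_neon.h> provides the float16x4_t/float16x8_t vector types but no __fp16 scalar type, so an fp16 vector constant cannot be built by passing a scalar to vdup_n_f16; instead the value is splatted as fp32 with vdupq_n_f32 and narrowed with vcvt_f16_f32. The sketch below is illustrative only and is not part of this patch; the helper names are invented, and it assumes _MSC_VER identifies the MSVC arm64 toolchain.

#include <arm_neon.h>

// Hypothetical helpers, not from the patch: splat a float constant into an
// fp16 vector portably across GCC/Clang and MSVC arm64.
static inline float16x4_t dup_f16_from_f32(float v)
{
#if _MSC_VER
    // MSVC: splat as fp32, then narrow the whole vector to 4 x fp16
    return vcvt_f16_f32(vdupq_n_f32(v));
#else
    // GCC/Clang: __fp16 scalars are supported directly
    return vdup_n_f16((__fp16)v);
#endif
}

static inline float16x8_t dupq_f16_from_f32(float v)
{
#if _MSC_VER
    // widen 4 fp16 lanes to 8 by duplicating the half vector
    float16x4_t _v = vcvt_f16_f32(vdupq_n_f32(v));
    return vcombine_f16(_v, _v);
#else
    return vdupq_n_f16((__fp16)v);
#endif
}

With helpers like these, each of the #if _MSC_VER blocks above would collapse to a single expression, e.g. float16x8_t _inv_area = dupq_f16_from_f32(1.f / area); the patch instead inlines the conversion at every call site, which keeps the diff local to the files that fail to build.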