From 19a7a1a2f0eee27aa6fac93f1fe36ad89c1b59ef Mon Sep 17 00:00:00 2001
From: nihuini
Date: Mon, 27 Nov 2023 17:11:26 +0800
Subject: [PATCH] fix build with msvc arm64 asimdhp

---
 CMakeLists.txt                                 |  58 ++---
 src/layer/arm/arm_activation.h                 |  28 ++-
 src/layer/arm/arm_usability.h                  | 109 ++++++++
 src/layer/arm/batchnorm_arm_asimdhp.cpp        |  16 +-
 src/layer/arm/binaryop_arm_asimdhp.cpp         |   1 +
 src/layer/arm/clip_arm_asimdhp.cpp             |   1 +
 src/layer/arm/convolution1d_arm.cpp            |   2 +-
 src/layer/arm/convolution1d_packed_fp16s.h     |  10 +-
 .../arm/convolution_3x3_winograd_fp16s.h       | 232 +++++++++---------
 src/layer/arm/convolution_arm_asimdhp.cpp      |  14 ++
 src/layer/arm/convolution_packed_fp16s.h       |  10 +-
 .../arm/convolutiondepthwise_arm_asimdhp.cpp   |   6 +-
 src/layer/arm/deconvolution_arm_asimdhp.cpp    |  18 +-
 .../deconvolutiondepthwise_arm_asimdhp.cpp     |   6 +-
 src/layer/arm/dequantize_arm_asimdhp.cpp       |   1 +
 src/layer/arm/eltwise_arm_asimdhp.cpp          |   1 +
 src/layer/arm/gelu_arm_asimdhp.cpp             |   5 +-
 src/layer/arm/gemm_arm_asimdhp.cpp             |   2 +-
 src/layer/arm/hardsigmoid_arm_asimdhp.cpp      |   1 +
 src/layer/arm/hardswish_arm_asimdhp.cpp        |   1 +
 src/layer/arm/innerproduct_arm_asimdhp.cpp     |  64 ++---
 src/layer/arm/instancenorm_arm_asimdhp.cpp     |  14 +-
 src/layer/arm/interp_arm_asimdhp.cpp           |   1 +
 src/layer/arm/interp_bicubic_fp16s.h           |   6 +-
 src/layer/arm/mish_arm_asimdhp.cpp             |   9 +-
 src/layer/arm/neon_mathfun_fp16s.h             |  89 +++++--
 src/layer/arm/pooling_arm_asimdhp.cpp          |  23 +-
 src/layer/arm/prelu_arm_asimdhp.cpp            |   9 +
 src/layer/arm/relu_arm_asimdhp.cpp             |   1 +
 src/layer/arm/sigmoid_arm_asimdhp.cpp          |  17 +-
 src/layer/arm/softmax_arm_asimdhp.cpp          |  57 ++---
 src/layer/arm/swish_arm_asimdhp.cpp            |  11 +-
 src/layer/arm/tanh_arm_asimdhp.cpp             |   7 +-
 src/layer/arm/unaryop_arm_asimdhp.cpp          |  27 +-
 34 files changed, 548 insertions(+), 309 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4404b5a9024..86ca5f6e6e2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -183,35 +183,35 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm")
         set(CMAKE_REQUIRED_FLAGS "/arch:armv8.0")
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)
 
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
-        # check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16x8_t _s, _a, _b; _s = vfmaq_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
-        # check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
-        # check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
-        # check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
-        # check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-        # check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat16_t _s, _a, _b; svbool_t bp; _s = svmla_f16_z(bp, _s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-        # check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint16_t _s; svint8_t _a, _b; _s = svmlslb_s16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-        # check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s; svbfloat16_t _a, _b; _s = svbfmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-        # check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint32_t _s; svint8_t _a, _b; _s = svmmla_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
-        #
-        # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-        # check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s, _a, _b; _s = svmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16x8_t _s, _a, _b; _s = vfmaq_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+        check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat16_t _s, _a, _b; svbool_t bp; _s = svmla_f16_z(bp, _s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+        check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint16_t _s; svint8_t _a, _b; _s = svmlslb_s16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+        check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s; svbfloat16_t _a, _b; _s = svbfmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+        check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint32_t _s; svint8_t _a, _b; _s = svmmla_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
+
+        set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+        check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s, _a, _b; _s = svmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
 
         unset(CMAKE_REQUIRED_FLAGS)
     else()
diff --git a/src/layer/arm/arm_activation.h b/src/layer/arm/arm_activation.h
index d1407e8f165..aca3e57479f 100644
--- a/src/layer/arm/arm_activation.h
+++ b/src/layer/arm/arm_activation.h
@@ -68,9 +68,10 @@ static inline float32x4_t activation_ps(float32x4_t _v, int activation_type, con
 }
 
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#include "arm_usability.h"
 #include "neon_mathfun_fp16s.h"
 
-static inline __fp16 activation_ss(__fp16 v, int activation_type, const ncnn::Mat& activation_params)
+static inline __fp16 activation_ss_f16(__fp16 v, int activation_type, const ncnn::Mat& activation_params)
 {
     if (activation_type == 1)
     {
@@ -92,11 +93,11 @@ static inline __fp16 activation_ss(__fp16 v, int activation_type, const ncnn::Ma
     }
     else if (activation_type == 4)
     {
-        v = (__fp16)1.f / ((__fp16)1.f + expf(-v));
+        v = (__fp16)1.f / ((__fp16)1.f + (__fp16)expf(-v));
     }
     else if (activation_type == 5)
     {
-        v = v * tanhf(logf(expf(v) + (__fp16)1.f));
+        v = v * (__fp16)tanhf(logf(expf((float)v) + 1.f));
     }
     else if (activation_type == 6)
     {
@@ -115,7 +116,7 @@ static inline __fp16 activation_ss(__fp16 v, int activation_type, const ncnn::Ma
     return v;
 }
 
-static inline float16x4_t activation_ps(float16x4_t _v, int activation_type, const ncnn::Mat& activation_params)
+static inline float16x4_t activation_ps_f16(float16x4_t _v, int activation_type, const ncnn::Mat& activation_params)
 {
     if (activation_type == 1)
     {
@@ -125,7 +126,11 @@ static inline float16x4_t activation_ps(float16x4_t _v, int activation_type, con
     else if (activation_type == 2)
     {
         const float16x4_t _zero = vdup_n_f16(0.f);
+#if _MSC_VER
+        const float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(activation_params[0]));
+#else
         const float16x4_t _slope = vdup_n_f16((__fp16)activation_params[0]);
+#endif
         const uint16x4_t _lemask = vcle_f16(_v, _zero);
         float16x4_t _ps = vmul_f16(_v, _slope);
         _v = vbsl_f16(_lemask, _ps, _v);
@@ -139,11 +144,11 @@ static inline float16x4_t activation_ps(float16x4_t _v, int activation_type, con
     }
     else if (activation_type == 4)
     {
-        _v = sigmoid_ps(_v);
+        _v = sigmoid_ps_f16(_v);
     }
     else if (activation_type == 5)
     {
-        _v = vmul_f16(_v, tanh_ps(log_ps(vadd_f16(exp_ps(_v), vdup_n_f16(1.f)))));
+        _v = vmul_f16(_v, tanh_ps_f16(log_ps_f16(vadd_f16(exp_ps_f16(_v), vdup_n_f16(1.f)))));
     }
     else if (activation_type == 6)
     {
@@ -161,7 +166,7 @@ static inline float16x4_t activation_ps(float16x4_t _v, int activation_type, con
     return _v;
 }
 
-static inline float16x8_t activation_ps(float16x8_t _v, int activation_type, const ncnn::Mat& activation_params)
+static inline float16x8_t activation_ps_f16(float16x8_t _v, int activation_type, const ncnn::Mat& activation_params)
 {
     if (activation_type == 1)
     {
@@ -171,7 +176,12 @@ static inline float16x8_t activation_ps(float16x8_t _v, int activation_type, con
     else if (activation_type == 2)
     {
         const float16x8_t _zero = vdupq_n_f16(0.f);
+#if _MSC_VER
+        const float16x4_t _slope0 = vcvt_f16_f32(vdupq_n_f32(activation_params[0]));
+        const float16x8_t _slope = vcombine_f16(_slope0, _slope0);
+#else
         const float16x8_t _slope = vdupq_n_f16((__fp16)activation_params[0]);
+#endif
         const uint16x8_t _lemask = vcleq_f16(_v, _zero);
         float16x8_t _ps = vmulq_f16(_v, _slope);
         _v = vbslq_f16(_lemask, _ps, _v);
@@ -185,11 +195,11 @@ static inline float16x8_t activation_ps(float16x8_t _v, int activation_type, con
     }
     else if (activation_type == 4)
     {
-        _v = sigmoid_ps(_v);
+        _v = sigmoid_ps_f16(_v);
     }
     else if (activation_type == 5)
     {
-        _v = vmulq_f16(_v, tanh_ps(log_ps(vaddq_f16(exp_ps(_v), vdupq_n_f16(1.f)))));
+        _v = vmulq_f16(_v, tanh_ps_f16(log_ps_f16(vaddq_f16(exp_ps_f16(_v), vdupq_n_f16(1.f)))));
     }
     else if (activation_type == 6)
     {
diff --git a/src/layer/arm/arm_usability.h b/src/layer/arm/arm_usability.h
index eb042d8ef34..4cd029caeed 100644
--- a/src/layer/arm/arm_usability.h
+++ b/src/layer/arm/arm_usability.h
@@ -123,6 +123,115 @@ static inline int8x8_t float2int8leakyrelu(float32x4_t _vlow, float32x4_t _vhigh
 }
 
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef _MSC_VER
+struct __fp16
+{
+    __fp16()
+    {
+        _u16 = 0;
+    }
+
+    __fp16(float f32)
+    {
+        _u16 = vget_lane_u16(vreinterpret_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
+    }
+
+    __fp16(__n16 n16)
+    {
+        _u16 = n16.n16_u16[0];
+    }
+
+    operator const float() const
+    {
+        return vgetq_lane_f32(vcvt_f32_f16(vreinterpret_f16_u16(vdup_n_u16(_u16))), 0);
+    }
+
+    __fp16& operator+=(const __fp16& b)
+    {
+        float a = (float)*this;
+        float f32 = (a + (float)b);
+        _u16 = vget_lane_u16(vreinterpret_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
+        return *this;
+    }
+
+    __fp16& operator-=(const __fp16& b)
+    {
+        float a = (float)*this;
+        float f32 = (a - (float)b);
+        _u16 = vget_lane_u16(vreinterpret_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
+        return *this;
+    }
+
+    __fp16& operator*=(const __fp16& b)
+    {
+        float a = (float)*this;
+        float f32 = (a * (float)b);
+        _u16 = vget_lane_u16(vreinterpret_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
+        return *this;
+    }
+
+    __fp16& operator/=(const __fp16& b)
+    {
+        float a = (float)*this;
+        float f32 = (a / (float)b);
+        _u16 = vget_lane_u16(vreinterpret_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
+        return *this;
+    }
+
+    unsigned short _u16;
+};
+
+static inline __fp16 operator-(const __fp16& a)
+{
+    return __fp16(-(float)a);
+}
+static inline __fp16 operator+(const __fp16& a, const __fp16& b)
+{
+    return __fp16((float)a + (float)b);
+}
+static inline __fp16 operator-(const __fp16& a, const __fp16& b)
+{
+    return __fp16((float)a - (float)b);
+}
+static inline __fp16 operator*(const __fp16& a, const __fp16& b)
+{
+    return __fp16((float)a * (float)b);
+}
+static inline __fp16 operator/(const __fp16& a, const __fp16& b)
+{
+    return __fp16((float)a / (float)b);
+}
+
+static inline float16x4_t vdup_n_f16(const __fp16& f16)
+{
+    return vreinterpret_f16_u16(vdup_n_u16(f16._u16));
+}
+
+static inline float16x8_t vdupq_n_f16(const __fp16& f16)
+{
+    return vreinterpretq_f16_u16(vdupq_n_u16(f16._u16));
+}
+
+static inline __fp16 vmaxv_f16(float16x4_t a)
+{
+    return __fp16(vmaxvq_f32(vcvt_f32_f16(a)));
+}
+
+static inline __fp16 vmaxvq_f16(float16x8_t a)
+{
+    return __fp16(fmaxf(vmaxvq_f32(vcvt_f32_f16(vget_low_f16(a))), vmaxvq_f32(vcvt_f32_f16(vget_high_f16(a)))));
+}
+
+#define vld1q_f16 vld1q_u16
+#define vst1q_f16 vst1q_u16
+
+#define vld2_f16 vld2_u16
+#define vst2_f16 vst2_u16
+
+#define vld2q_f16 vld2q_u16
+#define vst2q_f16 vst2q_u16
+
+#define vld4_f16 vld4_u16
+#define vst4_f16 vst4_u16
+
+#define vld4q_f16 vld4q_u16
+#define vst4q_f16 vst4q_u16
+
+#define vld1q_dup_f16 vld1q_dup_u16
+
+#define vset_lane_f16(x, v, i)  vset_lane_u16(x._u16, (uint16x4_t)v, i)
+#define vsetq_lane_f16(x, v, i) vsetq_lane_u16(x._u16, (uint16x8_t)v, i)
+
+#define vfma_n_f16(va, vb, x)  vfma_f16(va, vb, vdup_n_f16(x))
+#define vfmaq_n_f16(va, vb, x) vfmaq_f16(va, vb, vdupq_n_f16(x))
+
+#endif
+
 static inline signed char float2int8(__fp16 v)
 {
     int int32 = round(v);
diff --git a/src/layer/arm/batchnorm_arm_asimdhp.cpp b/src/layer/arm/batchnorm_arm_asimdhp.cpp
index 2fce9b706d5..3bfc1dbafec 100644
--- a/src/layer/arm/batchnorm_arm_asimdhp.cpp
+++ b/src/layer/arm/batchnorm_arm_asimdhp.cpp
@@ -18,6 +18,8 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
+#include "arm_usability.h"
+
 namespace ncnn {
 
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -109,7 +111,7 @@ int BatchNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt
         #pragma omp parallel for num_threads(opt.num_threads)
         for (int i = 0; i < w; i++)
         {
-            ptr[i] = b_data[i] * ptr[i] + a_data[i];
+            ptr[i] = b_data[i] * (float)ptr[i] + a_data[i];
         }
     }
 
@@ -140,7 +142,7 @@ int BatchNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt
             }
             for (; j < w; j++)
            {
-                *ptr = b * (float)*ptr + a;
+                *ptr = b * (float)*ptr + a;
                 ptr++;
             }
 
@@ -177,7 +179,7 @@ int BatchNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt
             }
             for (; j < size; j++)
            {
-                *ptr = b * *ptr + a;
+                *ptr = b * (float)*ptr + a;
                 ptr++;
             }
 
@@ -367,7 +369,11 @@ int BatchNorm_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
             __fp16 b = (__fp16)b_data[i];
 
             float16x4_t _a = vdup_n_f16(a);
+#if _MSC_VER
+            float16x4_t _b = vcvt_f16_f32(vdupq_n_f32(b_data[i]));
+#else
             float16x4_t _b = vdup_n_f16(b);
+#endif
 
             int j = 0;
             for (; j + 3 < w; j += 4)
@@ -404,7 +410,11 @@ int BatchNorm_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
             __fp16 b = (__fp16)b_data[q];
 
             float16x4_t _a = vdup_n_f16(a);
+#if _MSC_VER
+            float16x4_t _b = vcvt_f16_f32(vdupq_n_f32(b_data[q]));
+#else
             float16x4_t _b = vdup_n_f16(b);
+#endif
 
             int j = 0;
             for (; j + 3 < size; j += 4)
diff --git a/src/layer/arm/binaryop_arm_asimdhp.cpp b/src/layer/arm/binaryop_arm_asimdhp.cpp
index b9a8ea2d00b..8c49cadb88c 100644
--- a/src/layer/arm/binaryop_arm_asimdhp.cpp
+++ b/src/layer/arm/binaryop_arm_asimdhp.cpp
@@ -17,6 +17,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {
diff --git a/src/layer/arm/clip_arm_asimdhp.cpp b/src/layer/arm/clip_arm_asimdhp.cpp
index 3e4dc7bcd64..55040c3b6bd 100644
--- a/src/layer/arm/clip_arm_asimdhp.cpp
+++ b/src/layer/arm/clip_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #ifdef __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {
diff --git a/src/layer/arm/convolution1d_arm.cpp b/src/layer/arm/convolution1d_arm.cpp
index ab480aec62a..48368fb9cc6 100644
--- a/src/layer/arm/convolution1d_arm.cpp
+++ b/src/layer/arm/convolution1d_arm.cpp
@@ -18,8 +18,8 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
-#include "arm_activation.h"
 #include "arm_usability.h"
+#include "arm_activation.h"
 #include "cpu.h"
 #include "layer_type.h"
 
diff --git a/src/layer/arm/convolution1d_packed_fp16s.h b/src/layer/arm/convolution1d_packed_fp16s.h
index e55c45f573d..7b0c3e4975c 100644
--- a/src/layer/arm/convolution1d_packed_fp16s.h
+++ b/src/layer/arm/convolution1d_packed_fp16s.h
@@ -1378,7 +1378,7 @@ static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
             _sum2 = vaddq_f16(_sum2, _sum3);
             _sum0 = vaddq_f16(_sum0, _sum2);
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
 
             if (out_elempack == 8)
             {
@@ -1571,7 +1571,7 @@ static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
             _sum2 = vadd_f16(_sum2, _sum3);
             _sum0 = vadd_f16(_sum0, _sum2);
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
 
             if (out_elempack == 4)
             {
@@ -1737,8 +1737,8 @@ static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
                 sum0 += vget_lane_f16(_ss, 0);
                 sum1 += vget_lane_f16(_ss, 1);
 
-            sum0 = activation_ss(sum0, activation_type, activation_params);
-            sum1 = activation_ss(sum1, activation_type, activation_params);
+            sum0 = activation_ss_f16(sum0, activation_type, activation_params);
+            sum1 = activation_ss_f16(sum1, activation_type, activation_params);
 
             outptr0[0] = sum0;
             outptr1[0] = sum1;
@@ -1874,7 +1874,7 @@ static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
                 _ss = vpadd_f16(_ss, _ss);
                 sum += vget_lane_f16(_ss, 0);
 
-            sum = activation_ss(sum, activation_type, activation_params);
+            sum = activation_ss_f16(sum, activation_type, activation_params);
 
             outptr[0] = sum;
             outptr += 1;
diff --git a/src/layer/arm/convolution_3x3_winograd_fp16s.h b/src/layer/arm/convolution_3x3_winograd_fp16s.h
index 813b81299dd..7332d61a798 100644
--- a/src/layer/arm/convolution_3x3_winograd_fp16s.h
+++ b/src/layer/arm/convolution_3x3_winograd_fp16s.h
@@ -3562,15 +3562,15 @@ static inline void conv3x3s1_winograd43_transform_input_tile_fp16sa(const Mat& b
             __fp16 tmp12a0 = sq2 * r10 + msq2_d2 * r30;
             __fp16 tmp12a1 = sq2 * r11 + msq2_d2 * r31;
-            __fp16 tmp12b0 = r40 - 2 * r20;
-            __fp16 tmp12b1 = r41 - 2 * r21;
+            __fp16 tmp12b0 = r40 - (__fp16)2.f * r20;
+            __fp16 tmp12b1 = r41 - (__fp16)2.f * r21;
             __fp16 tmp34a0 = sq2 * r30 + msq2_d2 * r10;
             __fp16 tmp34a1 = sq2 * r31 + msq2_d2 * r11;
-            __fp16 tmp34b0 = r40 - 0.5f * r20;
-            __fp16 tmp34b1 = r41 - 0.5f * r21;
+            __fp16 tmp34b0 = r40 - (__fp16)0.5f * r20;
+            __fp16 tmp34b1 = r41 - (__fp16)0.5f * r21;
 
-            tmp[0][m][0] = r00 + r40 - 2.5f * r20;
-            tmp[0][m][1] = r01 + r41 - 2.5f * r21;
+            tmp[0][m][0] = r00 + r40 - (__fp16)2.5f * r20;
+            tmp[0][m][1] = r01 + r41 - (__fp16)2.5f * r21;
             tmp[1][m][0] = tmp12b0 - tmp12a0;
             tmp[1][m][1] = tmp12b1 - tmp12a1;
             tmp[2][m][0] = tmp12b0 + tmp12a0;
@@ -3579,8 +3579,8 @@ static inline void conv3x3s1_winograd43_transform_input_tile_fp16sa(const Mat& b
             tmp[3][m][1] = tmp34b1 + tmp34a1;
             tmp[4][m][0] = tmp34b0 - tmp34a0;
             tmp[4][m][1] = tmp34b1 - tmp34a1;
-            tmp[5][m][0] = r10 + r50 - 2.5f * r30;
-            tmp[5][m][1] = r11 + r51 - 2.5f * r31;
+            tmp[5][m][0] = r10 + r50 - (__fp16)2.5f * r30;
+            tmp[5][m][1] = r11 + r51 - (__fp16)2.5f * r31;
 
             r0 += w;
         }
@@ -3609,15 +3609,15 @@ static inline void conv3x3s1_winograd43_transform_input_tile_fp16sa(const Mat& b
             __fp16 tmp12a0 = sq2 * r10 + msq2_d2 * r30;
             __fp16 tmp12a1 = sq2 * r11 + msq2_d2 * r31;
-            __fp16 tmp12b0 = r40 - 2 * r20;
-            __fp16 tmp12b1 = r41 - 2 * r21;
+            __fp16 tmp12b0 = r40 - (__fp16)2.f * r20;
+            __fp16 tmp12b1 = r41 - (__fp16)2.f * r21;
             __fp16 tmp34a0 = sq2 * r30 + msq2_d2 * r10;
             __fp16 tmp34a1 = sq2 * r31 + msq2_d2 * r11;
-            __fp16 tmp34b0 = r40 - 0.5f * r20;
-            __fp16 tmp34b1 = r41 - 0.5f * r21;
+            __fp16 tmp34b0 = r40 - (__fp16)0.5f * r20;
+            __fp16 tmp34b1 = r41 - (__fp16)0.5f * r21;
 
-            p0[0] = r00 + r40 - 2.5f * r20;
-            p0[1] = r01 + r41 - 2.5f * r21;
+            p0[0] = r00 + r40 - (__fp16)2.5f * r20;
+            p0[1] = r01 + r41 - (__fp16)2.5f * r21;
             p1[0] = tmp12b0 - tmp12a0;
             p1[1] = tmp12b1 - tmp12a1;
             p2[0] = tmp12b0 + tmp12a0;
@@ -3626,8 +3626,8 @@ static inline void conv3x3s1_winograd43_transform_input_tile_fp16sa(const Mat& b
             p3[1] = tmp34b1 + tmp34a1;
             p4[0] = tmp34b0 - tmp34a0;
             p4[1] = tmp34b1 - tmp34a1;
-            p5[0] = r10 + r50 - 2.5f * r30;
-            p5[1] = r11 + r51 - 2.5f * r31;
+            p5[0] = r10 + r50 - (__fp16)2.5f * r30;
+            p5[1] = r11 + r51 - (__fp16)2.5f * r31;
 
             p0 += max_jj * 6 * 2;
             p1 += max_jj * 6 * 2;
@@ -3674,16 +3674,16 @@ static inline void conv3x3s1_winograd43_transform_input_tile_fp16sa(const Mat& b
             }
 
             __fp16 tmp12a = sq2 * r1 + msq2_d2 * r3;
-            __fp16 tmp12b = r4 - 2 * r2;
+            __fp16 tmp12b = r4 - (__fp16)2.f * r2;
             __fp16 tmp34a = sq2 * r3 + msq2_d2 * r1;
-            __fp16 tmp34b = r4 - 0.5f * r2;
+            __fp16 tmp34b = r4 - (__fp16)0.5f * r2;
 
-            tmp[0][m] = r0 + r4 - 2.5f * r2;
+            tmp[0][m] = r0 + r4 - (__fp16)2.5f * r2;
             tmp[1][m] = tmp12b - tmp12a;
             tmp[2][m] = tmp12b + tmp12a;
             tmp[3][m] = tmp34b + tmp34a;
             tmp[4][m] = tmp34b - tmp34a;
-            tmp[5][m] = r1 + r5 - 2.5f * r3;
+            tmp[5][m] = r1 + r5 - (__fp16)2.5f * r3;
 
             r0123 += w;
         }
@@ -3705,16 +3705,16 @@ static inline void conv3x3s1_winograd43_transform_input_tile_fp16sa(const Mat& b
             __fp16 r5 = tmp[m][5];
 
             __fp16 tmp12a = sq2 * r1 + msq2_d2 * r3;
-            __fp16 tmp12b = r4 - 2 * r2;
+            __fp16 tmp12b = r4 - (__fp16)2.f * r2;
             __fp16 tmp34a = sq2 * r3 + msq2_d2 * r1;
-            __fp16 tmp34b = r4 - 0.5f * r2;
+            __fp16 tmp34b = r4 - (__fp16)0.5f * r2;
 
-            p0[0] = r0 + r4 - 2.5f * r2;
+            p0[0] = r0 + r4 - (__fp16)2.5f * r2;
             p1[0] = tmp12b - tmp12a;
             p2[0] = tmp12b + tmp12a;
             p3[0] = tmp34b + tmp34a;
             p4[0] = tmp34b - tmp34a;
-            p5[0] = r1 + r5 - 2.5f * r3;
+            p5[0] = r1 + r5 - (__fp16)2.5f * r3;
 
             p0 += max_jj * 6;
             p1 += max_jj * 6;
@@ -4107,8 +4107,8 @@ static inline void conv3x3s1_winograd43_transform_output_tile_fp16sa(const Mat&
             tmp[0][m][1] = r0[1] + tmp02a1 + tmp02b1;
             tmp[1][m][0] = tmp13a0 * sq2_d2 + tmp13b0 * sq2;
             tmp[1][m][1] = tmp13a1 * sq2_d2 + tmp13b1 * sq2;
-            tmp[2][m][0] = tmp02a0 * 0.5f + tmp02b0 * 2;
-            tmp[2][m][1] = tmp02a1 * 0.5f + tmp02b1 * 2;
+            tmp[2][m][0] = tmp02a0 * (__fp16)0.5f + tmp02b0 * (__fp16)2;
+            tmp[2][m][1] = tmp02a1 * (__fp16)0.5f + tmp02b1 * (__fp16)2;
             tmp[3][m][0] = r5[0] + tmp13a0 * sq2_d4 + tmp13b0 * sq2_m2;
             tmp[3][m][1] = r5[1] + tmp13a1 * sq2_d4 + tmp13b1 * sq2_m2;
 
@@ -4153,8 +4153,8 @@ static inline void conv3x3s1_winograd43_transform_output_tile_fp16sa(const Mat&
             __fp16 tmp01 = bias1 + r01 + tmp02a1 + tmp02b1;
             __fp16 tmp10 = bias0 + tmp13a0 * sq2_d2 + tmp13b0 * sq2;
             __fp16 tmp11 = bias1 + tmp13a1 * sq2_d2 + tmp13b1 * sq2;
-            __fp16 tmp20 = bias0 + tmp02a0 * 0.5f + tmp02b0 * 2;
-            __fp16 tmp21 = bias1 + tmp02a1 * 0.5f + tmp02b1 * 2;
+            __fp16 tmp20 = bias0 + tmp02a0 * (__fp16)0.5f + tmp02b0 * (__fp16)2;
+            __fp16 tmp21 = bias1 + tmp02a1 * (__fp16)0.5f + tmp02b1 * (__fp16)2;
             __fp16 tmp30 = bias0 + r50 + tmp13a0 * sq2_d4 + tmp13b0 * sq2_m2;
             __fp16 tmp31 = bias1 + r51 + tmp13a1 * sq2_d4 + tmp13b1 * sq2_m2;
 
@@ -4213,7 +4213,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile_fp16sa(const Mat&
             tmp[0][m] = r0[0] + tmp02a + tmp02b;
             tmp[1][m] = tmp13a * sq2_d2 + tmp13b * sq2;
-            tmp[2][m] = tmp02a * 0.5f + tmp02b * 2;
+            tmp[2][m] = tmp02a * (__fp16)0.5f + tmp02b * (__fp16)2;
             tmp[3][m] = r5[0] + tmp13a * sq2_d4 + tmp13b * sq2_m2;
 
             r0 += max_jj * 6;
@@ -4245,7 +4245,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile_fp16sa(const Mat&
             __fp16 tmp0 = bias0 + r0 + tmp02a + tmp02b;
             __fp16 tmp1 = bias0 + tmp13a * sq2_d2 + tmp13b * sq2;
-            __fp16 tmp2 = bias0 + tmp02a * 0.5f + tmp02b * 2;
+            __fp16 tmp2 = bias0 + tmp02a * (__fp16)0.5f + tmp02b * (__fp16)2;
             __fp16 tmp3 = bias0 + r5 + tmp13a * sq2_d4 + tmp13b * sq2_m2;
 
             // if (out_elempack == 1)
@@ -4993,21 +4993,21 @@ static inline void conv3x3s1_winograd63_transform_input_tile_fp16sa(const Mat& b
                 }
             }
 
-            __fp16 tmp12a0 = r20 + r60 - r40 * 4.25f;
-            __fp16 tmp12a1 = r21 + r61 - r41 * 4.25f;
-            __fp16 tmp12b0 = r10 + r50 - r30 * 4.25f;
-            __fp16 tmp12b1 = r11 + r51 - r31 * 4.25f;
-            __fp16 tmp34a0 = r60 + r20 * 0.25f - r40 * 1.25f;
-            __fp16 tmp34a1 = r61 + r21 * 0.25f - r41 * 1.25f;
-            __fp16 tmp34b0 = r10 * 0.5f - r30 * 2.5f + r50 * 2.f;
-            __fp16 tmp34b1 = r11 * 0.5f - r31 * 2.5f + r51 * 2.f;
-            __fp16 tmp56a0 = r20 * 4.f - r40 * 5.f + r60;
-            __fp16 tmp56a1 = r21 * 4.f - r41 * 5.f + r61;
-            __fp16 tmp56b0 = r10 * 2.f - r30 * 2.5f + r50 * 0.5f;
-            __fp16 tmp56b1 = r11 * 2.f - r31 * 2.5f + r51 * 0.5f;
-
-            tmp[0][m][0] = r00 - r60 + (r40 - r20) * 5.25f;
-            tmp[0][m][1] = r01 - r61 + (r41 - r21) * 5.25f;
+            __fp16 tmp12a0 = r20 + r60 - r40 * (__fp16)4.25f;
+            __fp16 tmp12a1 = r21 + r61 - r41 * (__fp16)4.25f;
+            __fp16 tmp12b0 = r10 + r50 - r30 * (__fp16)4.25f;
+            __fp16 tmp12b1 = r11 + r51 - r31 * (__fp16)4.25f;
+            __fp16 tmp34a0 = r60 + r20 * (__fp16)0.25f - r40 * (__fp16)1.25f;
+            __fp16 tmp34a1 = r61 + r21 * (__fp16)0.25f - r41 * (__fp16)1.25f;
+            __fp16 tmp34b0 = r10 * (__fp16)0.5f - r30 * (__fp16)2.5f + r50 * (__fp16)2.f;
+            __fp16 tmp34b1 = r11 * (__fp16)0.5f - r31 * (__fp16)2.5f + r51 * (__fp16)2.f;
+            __fp16 tmp56a0 = r20 * (__fp16)4.f - r40 * (__fp16)5.f + r60;
+            __fp16 tmp56a1 = r21 * (__fp16)4.f - r41 * (__fp16)5.f + r61;
+            __fp16 tmp56b0 = r10 * (__fp16)2.f - r30 * (__fp16)2.5f + r50 * (__fp16)0.5f;
+            __fp16 tmp56b1 = r11 * (__fp16)2.f - r31 * (__fp16)2.5f + r51 * (__fp16)0.5f;
+
+            tmp[0][m][0] = r00 - r60 + (r40 - r20) * (__fp16)5.25f;
+            tmp[0][m][1] = r01 - r61 + (r41 - r21) * (__fp16)5.25f;
             tmp[1][m][0] = tmp12a0 + tmp12b0;
             tmp[1][m][1] = tmp12a1 + tmp12b1;
             tmp[2][m][0] = tmp12a0 - tmp12b0;
@@ -5020,8 +5020,8 @@ static inline void conv3x3s1_winograd63_transform_input_tile_fp16sa(const Mat& b
             tmp[5][m][1] = tmp56a1 + tmp56b1;
             tmp[6][m][0] = tmp56a0 - tmp56b0;
             tmp[6][m][1] = tmp56a1 - tmp56b1;
-            tmp[7][m][0] = r70 - r10 + (r30 - r50) * 5.25f;
-            tmp[7][m][1] = r71 - r11 + (r31 - r51) * 5.25f;
+            tmp[7][m][0] = r70 - r10 + (r30 - r50) * (__fp16)5.25f;
+            tmp[7][m][1] = r71 - r11 + (r31 - r51) * (__fp16)5.25f;
 
             r0 += w;
         }
@@ -5054,21 +5054,21 @@ static inline void conv3x3s1_winograd63_transform_input_tile_fp16sa(const Mat& b
             __fp16 r70 = tmp[m][7][0];
             __fp16 r71 = tmp[m][7][1];
 
-            __fp16 tmp12a0 = r20 + r60 - r40 * 4.25f;
-            __fp16 tmp12a1 = r21 + r61 - r41 * 4.25f;
-            __fp16 tmp12b0 = r10 + r50 - r30 * 4.25f;
-            __fp16 tmp12b1 = r11 + r51 - r31 * 4.25f;
-            __fp16 tmp34a0 = r60 + r20 * 0.25f - r40 * 1.25f;
-            __fp16 tmp34a1 = r61 + r21 * 0.25f - r41 * 1.25f;
-            __fp16 tmp34b0 = r10 * 0.5f - r30 * 2.5f + r50 * 2.f;
-            __fp16 tmp34b1 = r11 * 0.5f - r31 * 2.5f + r51 * 2.f;
-            __fp16 tmp56a0 = r20 * 4.f - r40 * 5.f + r60;
-            __fp16 tmp56a1 = r21 * 4.f - r41 * 5.f + r61;
-            __fp16 tmp56b0 = r10 * 2.f - r30 * 2.5f + r50 * 0.5f;
-            __fp16 tmp56b1 = r11 * 2.f - r31 * 2.5f + r51 * 0.5f;
-
-            p0[0] = r00 - r60 + (r40 - r20) * 5.25f;
-            p0[1] = r01 - r61 + (r41 - r21) * 5.25f;
+            __fp16 tmp12a0 = r20 + r60 - r40 * (__fp16)4.25f;
+            __fp16 tmp12a1 = r21 + r61 - r41 * (__fp16)4.25f;
+            __fp16 tmp12b0 = r10 + r50 - r30 * (__fp16)4.25f;
+            __fp16 tmp12b1 = r11 + r51 - r31 * (__fp16)4.25f;
+            __fp16 tmp34a0 = r60 + r20 * (__fp16)0.25f - r40 * (__fp16)1.25f;
+            __fp16 tmp34a1 = r61 + r21 * (__fp16)0.25f - r41 * (__fp16)1.25f;
+            __fp16 tmp34b0 = r10 * (__fp16)0.5f - r30 * (__fp16)2.5f + r50 * (__fp16)2.f;
+            __fp16 tmp34b1 = r11 * (__fp16)0.5f - r31 * (__fp16)2.5f + r51 * (__fp16)2.f;
+            __fp16 tmp56a0 = r20 * (__fp16)4.f - r40 * (__fp16)5.f + r60;
+            __fp16 tmp56a1 = r21 * (__fp16)4.f - r41 * (__fp16)5.f + r61;
+            __fp16 tmp56b0 = r10 * (__fp16)2.f - r30 * (__fp16)2.5f + r50 * (__fp16)0.5f;
+            __fp16 tmp56b1 = r11 * (__fp16)2.f - r31 * (__fp16)2.5f + r51 * (__fp16)0.5f;
+
+            p0[0] = r00 - r60 + (r40 - r20) * (__fp16)5.25f;
+            p0[1] = r01 - r61 + (r41 - r21) * (__fp16)5.25f;
             p1[0] = tmp12a0 + tmp12b0;
             p1[1] = tmp12a1 + tmp12b1;
             p2[0] = tmp12a0 - tmp12b0;
@@ -5081,8 +5081,8 @@ static inline void conv3x3s1_winograd63_transform_input_tile_fp16sa(const Mat& b
             p5[1] = tmp56a1 + tmp56b1;
             p6[0] = tmp56a0 - tmp56b0;
             p6[1] = tmp56a1 - tmp56b1;
-            p7[0] = r70 - r10 + (r30 - r50) * 5.25f;
-            p7[1] = r71 - r11 + (r31 - r51) * 5.25f;
+            p7[0] = r70 - r10 + (r30 - r50) * (__fp16)5.25f;
+            p7[1] = r71 - r11 + (r31 - r51) * (__fp16)5.25f;
 
             p0 += max_jj * 8 * 2;
             p1 += max_jj * 8 * 2;
@@ -5134,21 +5134,21 @@ static inline void conv3x3s1_winograd63_transform_input_tile_fp16sa(const Mat& b
                 }
             }
 
-            __fp16 tmp12a = r2 + r6 - r4 * 4.25f;
-            __fp16 tmp12b = r1 + r5 - r3 * 4.25f;
-            __fp16 tmp34a = r6 + r2 * 0.25f - r4 * 1.25f;
-            __fp16 tmp34b = r1 * 0.5f - r3 * 2.5f + r5 * 2.f;
-            __fp16 tmp56a = r2 * 4.f - r4 * 5.f + r6;
-            __fp16 tmp56b = r1 * 2.f - r3 * 2.5f + r5 * 0.5f;
+            __fp16 tmp12a = r2 + r6 - r4 * (__fp16)4.25f;
+            __fp16 tmp12b = r1 + r5 - r3 * (__fp16)4.25f;
+            __fp16 tmp34a = r6 + r2 * (__fp16)0.25f - r4 * (__fp16)1.25f;
+            __fp16 tmp34b = r1 * (__fp16)0.5f - r3 * (__fp16)2.5f + r5 * (__fp16)2.f;
+            __fp16 tmp56a = r2 * (__fp16)4.f - r4 * (__fp16)5.f + r6;
+            __fp16 tmp56b = r1 * (__fp16)2.f - r3 * (__fp16)2.5f + r5 * (__fp16)0.5f;
 
-            tmp[0][m] = r0 - r6 + (r4 - r2) * 5.25f;
+            tmp[0][m] = r0 - r6 + (r4 - r2) * (__fp16)5.25f;
             tmp[1][m] = tmp12a + tmp12b;
             tmp[2][m] = tmp12a - tmp12b;
             tmp[3][m] = tmp34a + tmp34b;
             tmp[4][m] = tmp34a - tmp34b;
             tmp[5][m] = tmp56a + tmp56b;
             tmp[6][m] = tmp56a - tmp56b;
-            tmp[7][m] = r7 - r1 + (r3 - r5) * 5.25f;
+            tmp[7][m] = r7 - r1 + (r3 - r5) * (__fp16)5.25f;
 
             r0123 += w;
         }
@@ -5173,21 +5173,21 @@ static inline void conv3x3s1_winograd63_transform_input_tile_fp16sa(const Mat& b
             __fp16 r6 = tmp[m][6];
             __fp16 r7 = tmp[m][7];
 
-            __fp16 tmp12a = r2 + r6 - r4 * 4.25f;
-            __fp16 tmp12b = r1 + r5 - r3 * 4.25f;
-            __fp16 tmp34a = r6 + r2 * 0.25f - r4 * 1.25f;
-            __fp16 tmp34b = r1 * 0.5f - r3 * 2.5f + r5 * 2.f;
-            __fp16 tmp56a = r2 * 4.f - r4 * 5.f + r6;
-            __fp16 tmp56b = r1 * 2.f - r3 * 2.5f + r5 * 0.5f;
+            __fp16 tmp12a = r2 + r6 - r4 * (__fp16)4.25f;
+            __fp16 tmp12b = r1 + r5 - r3 * (__fp16)4.25f;
+            __fp16 tmp34a = r6 + r2 * (__fp16)0.25f - r4 * (__fp16)1.25f;
+            __fp16 tmp34b = r1 * (__fp16)0.5f - r3 * (__fp16)2.5f + r5 * (__fp16)2.f;
+            __fp16 tmp56a = r2 * (__fp16)4.f - r4 * (__fp16)5.f + r6;
+            __fp16 tmp56b = r1 * (__fp16)2.f - r3 * (__fp16)2.5f + r5 * (__fp16)0.5f;
 
-            p0[0] = r0 - r6 + (r4 - r2) * 5.25f;
+            p0[0] = r0 - r6 + (r4 - r2) * (__fp16)5.25f;
             p1[0] = tmp12a + tmp12b;
             p2[0] = tmp12a - tmp12b;
             p3[0] = tmp34a + tmp34b;
             p4[0] = tmp34a - tmp34b;
             p5[0] = tmp56a + tmp56b;
             p6[0] = tmp56a - tmp56b;
-            p7[0] = r7 - r1 + (r3 - r5) * 5.25f;
+            p7[0] = r7 - r1 + (r3 - r5) * (__fp16)5.25f;
 
             p0 += max_jj * 8;
             p1 += max_jj * 8;
@@ -5670,18 +5670,18 @@ static inline void conv3x3s1_winograd63_transform_output_tile_fp16sa(const Mat&
             __fp16 tmp135c0 = r5[0] - r6[0];
             __fp16 tmp135c1 = r5[1] - r6[1];
 
-            tmp[0][m][0] = r0[0] + tmp024a0 + tmp024b0 + tmp024c0 * 32;
-            tmp[0][m][1] = r0[1] + tmp024a1 + tmp024b1 + tmp024c1 * 32;
-            tmp[1][m][0] = tmp135a0 + tmp135b0 + tmp135b0 + tmp135c0 * 16;
-            tmp[1][m][1] = tmp135a1 + tmp135b1 + tmp135b1 + tmp135c1 * 16;
-            tmp[2][m][0] = tmp024a0 + tmp024b0 * 4 + tmp024c0 * 8;
-            tmp[2][m][1] = tmp024a1 + tmp024b1 * 4 + tmp024c1 * 8;
-            tmp[3][m][0] = tmp135a0 + tmp135b0 * 8 + tmp135c0 * 4;
-            tmp[3][m][1] = tmp135a1 + tmp135b1 * 8 + tmp135c1 * 4;
-            tmp[4][m][0] = tmp024a0 + tmp024b0 * 16 + tmp024c0 + tmp024c0;
-            tmp[4][m][1] = tmp024a1 + tmp024b1 * 16 + tmp024c1 + tmp024c1;
-            tmp[5][m][0] = r7[0] + tmp135a0 + tmp135b0 * 32 + tmp135c0;
-            tmp[5][m][1] = r7[1] + tmp135a1 + tmp135b1 * 32 + tmp135c1;
+            tmp[0][m][0] = r0[0] + tmp024a0 + tmp024b0 + tmp024c0 * (__fp16)32;
+            tmp[0][m][1] = r0[1] + tmp024a1 + tmp024b1 + tmp024c1 * (__fp16)32;
+            tmp[1][m][0] = tmp135a0 + tmp135b0 + tmp135b0 + tmp135c0 * (__fp16)16;
+            tmp[1][m][1] = tmp135a1 + tmp135b1 + tmp135b1 + tmp135c1 * (__fp16)16;
+            tmp[2][m][0] = tmp024a0 + tmp024b0 * (__fp16)4 + tmp024c0 * (__fp16)8;
+            tmp[2][m][1] = tmp024a1 + tmp024b1 * (__fp16)4 + tmp024c1 * (__fp16)8;
+            tmp[3][m][0] = tmp135a0 + tmp135b0 * (__fp16)8 + tmp135c0 * (__fp16)4;
+            tmp[3][m][1] = tmp135a1 + tmp135b1 * (__fp16)8 + tmp135c1 * (__fp16)4;
+            tmp[4][m][0] = tmp024a0 + tmp024b0 * (__fp16)16 + tmp024c0 + tmp024c0;
+            tmp[4][m][1] = tmp024a1 + tmp024b1 * (__fp16)16 + tmp024c1 + tmp024c1;
+            tmp[5][m][0] = r7[0] + tmp135a0 + tmp135b0 * (__fp16)32 + tmp135c0;
+            tmp[5][m][1] = r7[1] + tmp135a1 + tmp135b1 * (__fp16)32 + tmp135c1;
 
             r0 += max_jj * 8 * 2;
             r1 += max_jj * 8 * 2;
@@ -5730,18 +5730,18 @@ static inline void conv3x3s1_winograd63_transform_output_tile_fp16sa(const Mat&
             __fp16 tmp135c0 = r50 - r60;
             __fp16 tmp135c1 = r51 - r61;
 
-            __fp16 tmp00 = bias0 + r00 + tmp024a0 + tmp024b0 + tmp024c0 * 32;
-            __fp16 tmp01 = bias1 + r01 + tmp024a1 + tmp024b1 + tmp024c1 * 32;
-            __fp16 tmp10 = bias0 + tmp135a0 + tmp135b0 + tmp135b0 + tmp135c0 * 16;
-            __fp16 tmp11 = bias1 + tmp135a1 + tmp135b1 + tmp135b1 + tmp135c1 * 16;
-            __fp16 tmp20 = bias0 + tmp024a0 + tmp024b0 * 4 + tmp024c0 * 8;
-            __fp16 tmp21 = bias1 + tmp024a1 + tmp024b1 * 4 + tmp024c1 * 8;
-            __fp16 tmp30 = bias0 + tmp135a0 + tmp135b0 * 8 + tmp135c0 * 4;
-            __fp16 tmp31 = bias1 + tmp135a1 + tmp135b1 * 8 + tmp135c1 * 4;
-            __fp16 tmp40 = bias0 + tmp024a0 + tmp024b0 * 16 + tmp024c0 + tmp024c0;
-            __fp16 tmp41 = bias1 + tmp024a1 + tmp024b1 * 16 + tmp024c1 + tmp024c1;
-            __fp16 tmp50 = bias0 + r70 + tmp135a0 + tmp135b0 * 32 + tmp135c0;
-            __fp16 tmp51 = bias1 + r71 + tmp135a1 + tmp135b1 * 32 + tmp135c1;
+            __fp16 tmp00 = bias0 + r00 + tmp024a0 + tmp024b0 + tmp024c0 * (__fp16)32;
+            __fp16 tmp01 = bias1 + r01 + tmp024a1 + tmp024b1 + tmp024c1 * (__fp16)32;
+            __fp16 tmp10 = bias0 + tmp135a0 + tmp135b0 + tmp135b0 + tmp135c0 * (__fp16)16;
+            __fp16 tmp11 = bias1 + tmp135a1 + tmp135b1 + tmp135b1 + tmp135c1 * (__fp16)16;
+            __fp16 tmp20 = bias0 + tmp024a0 + tmp024b0 * (__fp16)4 + tmp024c0 * (__fp16)8;
+            __fp16 tmp21 = bias1 + tmp024a1 + tmp024b1 * (__fp16)4 + tmp024c1 * (__fp16)8;
+            __fp16 tmp30 = bias0 + tmp135a0 + tmp135b0 * (__fp16)8 + tmp135c0 * (__fp16)4;
+            __fp16 tmp31 = bias1 + tmp135a1 + tmp135b1 * (__fp16)8 + tmp135c1 * (__fp16)4;
+            __fp16 tmp40 = bias0 + tmp024a0 + tmp024b0 * (__fp16)16 + tmp024c0 + tmp024c0;
+            __fp16 tmp41 = bias1 + tmp024a1 + tmp024b1 * (__fp16)16 + tmp024c1 + tmp024c1;
+            __fp16 tmp50 = bias0 + r70 + tmp135a0 + tmp135b0 * (__fp16)32 + tmp135c0;
+            __fp16 tmp51 = bias1 + r71 + tmp135a1 + tmp135b1 * (__fp16)32 + tmp135c1;
 
             // if (out_elempack == 1)
             {
@@ -5810,12 +5810,12 @@ static inline void conv3x3s1_winograd63_transform_output_tile_fp16sa(const Mat&
             __fp16 tmp024c = r5[0] + r6[0];
             __fp16 tmp135c = r5[0] - r6[0];
 
-            tmp[0][m] = r0[0] + tmp024a + tmp024b + tmp024c * 32;
-            tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * 16;
-            tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 8;
-            tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4;
-            tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c + tmp024c;
-            tmp[5][m] = r7[0] + tmp135a + tmp135b * 32 + tmp135c;
+            tmp[0][m] = r0[0] + tmp024a + tmp024b + tmp024c * (__fp16)32;
+            tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * (__fp16)16;
+            tmp[2][m] = tmp024a + tmp024b * (__fp16)4 + tmp024c * (__fp16)8;
+            tmp[3][m] = tmp135a + tmp135b * (__fp16)8 + tmp135c * (__fp16)4;
+            tmp[4][m] = tmp024a + tmp024b * (__fp16)16 + tmp024c + tmp024c;
+            tmp[5][m] = r7[0] + tmp135a + tmp135b * (__fp16)32 + tmp135c;
 
             r0 += max_jj * 8;
             r1 += max_jj * 8;
@@ -5850,12 +5850,12 @@ static inline void conv3x3s1_winograd63_transform_output_tile_fp16sa(const Mat&
             __fp16 tmp024c = r5 + r6;
             __fp16 tmp135c = r5 - r6;
 
-            __fp16 tmp0 = bias0 + r0 + tmp024a + tmp024b + tmp024c * 32;
-            __fp16 tmp1 = bias0 + tmp135a + tmp135b + tmp135b + tmp135c * 16;
-            __fp16 tmp2 = bias0 + tmp024a + tmp024b * 4 + tmp024c * 8;
-            __fp16 tmp3 = bias0 + tmp135a + tmp135b * 8 + tmp135c * 4;
-            __fp16 tmp4 = bias0 + tmp024a + tmp024b * 16 + tmp024c + tmp024c;
-            __fp16 tmp5 = bias0 + r7 + tmp135a + tmp135b * 32 + tmp135c;
+            __fp16 tmp0 = bias0 + r0 + tmp024a + tmp024b + tmp024c * (__fp16)32;
+            __fp16 tmp1 = bias0 + tmp135a + tmp135b + tmp135b + tmp135c * (__fp16)16;
+            __fp16 tmp2 = bias0 + tmp024a + tmp024b * (__fp16)4 + tmp024c * (__fp16)8;
+            __fp16 tmp3 = bias0 + tmp135a + tmp135b * (__fp16)8 + tmp135c * (__fp16)4;
+            __fp16 tmp4 = bias0 + tmp024a + tmp024b * (__fp16)16 + tmp024c + tmp024c;
+            __fp16 tmp5 = bias0 + r7 + tmp135a + tmp135b * (__fp16)32 + tmp135c;
 
             // if (out_elempack == 1)
             {
diff --git a/src/layer/arm/convolution_arm_asimdhp.cpp b/src/layer/arm/convolution_arm_asimdhp.cpp
index 4d229dc4db7..6480aa2e78a 100644
--- a/src/layer/arm/convolution_arm_asimdhp.cpp
+++ b/src/layer/arm/convolution_arm_asimdhp.cpp
@@ -34,12 +34,14 @@ namespace ncnn {
 #include "convolution_im2col_gemm_bf16s_fp16s.h"
 #include "convolution_im2col_gemm_fp16s.h"
 
+#if NCNN_GNU_INLINE_ASM
 #include "convolution_3x3_pack4_fp16s.h"
 #include "convolution_3x3_pack1to8_fp16s.h"
 #include "convolution_3x3_pack1to4_fp16s.h"
 #include "convolution_3x3_pack8_fp16s.h"
 #include "convolution_5x5_pack8_fp16s.h"
 #include "convolution_7x7_pack1to8_fp16s.h"
+#endif // NCNN_GNU_INLINE_ASM
 #endif
 #endif // __ARM_NEON
 
@@ -122,6 +124,7 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
     int l2_cache_size_fp16 = get_cpu_level2_cache_size() / sizeof(unsigned short);
     bool prefer_sgemm = num_input * num_output * kernel_w * kernel_h * dilation_w * dilation_h * stride_w * stride_h * 2 > l2_cache_size_fp16 || (num_input > 16 || num_output > 16);
 
+#if NCNN_GNU_INLINE_ASM
     if (elempack == 8 && out_elempack == 8)
     {
         if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
@@ -181,6 +184,7 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
             prefer_sgemm = false;
         }
     }
+#endif // NCNN_GNU_INLINE_ASM
 
     if (opt.use_fp16_arithmetic && ((opt.use_sgemm_convolution && prefer_sgemm) || (kernel_w == 1 && kernel_h == 1)))
     {
@@ -196,6 +200,7 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
         return 0;
     }
 
+#if NCNN_GNU_INLINE_ASM
     if ((elempack == 8 && out_elempack == 8 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
         || (elempack == 8 && out_elempack == 8 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
         || (elempack == 8 && out_elempack == 8 && kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
@@ -210,6 +215,7 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
         convolution_transform_kernel_packed_fp16s_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
     }
     else
+#endif // NCNN_GNU_INLINE_ASM
     {
         convolution_transform_kernel_packed_fp16s(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
     }
@@ -399,6 +405,7 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
     int l2_cache_size_fp16 = get_cpu_level2_cache_size() / sizeof(unsigned short);
     bool prefer_sgemm = num_input * num_output * kernel_w * kernel_h * dilation_w * dilation_h * stride_w * stride_h * 2 > l2_cache_size_fp16 || (num_input > 16 || num_output > 16);
 
+#if NCNN_GNU_INLINE_ASM
     if (elempack == 8 && out_elempack == 8)
     {
         if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
@@ -458,6 +465,7 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
             prefer_sgemm = false;
         }
     }
+#endif // NCNN_GNU_INLINE_ASM
 
     if ((opt.use_sgemm_convolution && prefer_sgemm) || (kernel_w == 1 && kernel_h == 1))
     {
@@ -478,6 +486,7 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
         return 0;
     }
 
+#if NCNN_GNU_INLINE_ASM
     if (elempack == 8 && out_elempack == 8)
     {
         if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
@@ -634,6 +643,11 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
             convolution_packed_fp16sa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
         }
     }
+#else  // NCNN_GNU_INLINE_ASM
    {
+        convolution_packed_fp16sa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
+    }
+#endif // NCNN_GNU_INLINE_ASM
 
     return 0;
 }
diff --git a/src/layer/arm/convolution_packed_fp16s.h b/src/layer/arm/convolution_packed_fp16s.h
index dddb1ec544d..d565fb23fb9 100644
--- a/src/layer/arm/convolution_packed_fp16s.h
+++ b/src/layer/arm/convolution_packed_fp16s.h
@@ -1428,7 +1428,7 @@ static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
             _sum2 = vaddq_f16(_sum2, _sum3);
             _sum0 = vaddq_f16(_sum0, _sum2);
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
 
             if (out_elempack == 8)
             {
@@ -1621,7 +1621,7 @@ static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
             _sum2 = vadd_f16(_sum2, _sum3);
             _sum0 = vadd_f16(_sum0, _sum2);
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
 
             if (out_elempack == 4)
             {
@@ -1787,8 +1787,8 @@ static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                 sum0 += vget_lane_f16(_ss, 0);
                 sum1 += vget_lane_f16(_ss, 1);
 
-            sum0 = activation_ss(sum0, activation_type, activation_params);
-            sum1 = activation_ss(sum1, activation_type, activation_params);
+            sum0 = activation_ss_f16(sum0, activation_type, activation_params);
+            sum1 = activation_ss_f16(sum1, activation_type, activation_params);
 
             outptr0[0] = sum0;
             outptr1[0] = sum1;
@@ -1923,7 +1923,7 @@ static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                 _ss = vpadd_f16(_ss, _ss);
                 sum += vget_lane_f16(_ss, 0);
 
-            sum = activation_ss(sum, activation_type, activation_params);
+            sum = activation_ss_f16(sum, activation_type, activation_params);
 
             outptr[0] = sum;
             outptr += 1;
diff --git a/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp b/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp
index 94c5cb0176e..f7d2cfee84c 100644
--- a/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp
@@ -431,7 +431,7 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
                         _sum = vfmaq_f16(_sum, _val, _w);
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1q_f16(outptr + j * 8, _sum);
                 }
@@ -493,7 +493,7 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
                         _sum = vfma_f16(_sum, _val, _w);
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1_f16(outptr + j * 4, _sum);
                 }
@@ -574,7 +574,7 @@ int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_bl
                         sum += val * w;
                     }
 
-                    sum = activation_ss(sum, activation_type, activation_params);
+                    sum = activation_ss_f16(sum, activation_type, activation_params);
 
                     outptr[j] = (__fp16)sum;
                 }
diff --git a/src/layer/arm/deconvolution_arm_asimdhp.cpp b/src/layer/arm/deconvolution_arm_asimdhp.cpp
index c786a541f5b..c98ba40309b 100644
--- a/src/layer/arm/deconvolution_arm_asimdhp.cpp
+++ b/src/layer/arm/deconvolution_arm_asimdhp.cpp
@@ -767,7 +767,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                         kptr += maxk * 64;
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1q_f16(outptr + j * 8, _sum);
                 }
@@ -838,7 +838,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                         kptr += maxk * 8;
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1q_f16(outptr + j * 8, _sum);
                 }
@@ -915,7 +915,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                        kptr += maxk * 32;
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1q_f16(outptr + j * 8, _sum);
                 }
@@ -989,7 +989,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                         kptr += maxk * 8;
                     }
 
-                    sum = activation_ss(sum, activation_type, activation_params);
+                    sum = activation_ss_f16(sum, activation_type, activation_params);
 
                     outptr[j] = (__fp16)sum;
                 }
@@ -1074,7 +1074,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                         kptr += maxk * 32;
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1_f16(outptr + j * 4, _sum);
                 }
@@ -1151,7 +1151,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                         kptr += maxk * 16;
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1_f16(outptr + j * 4, _sum);
                 }
@@ -1222,7 +1222,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                         kptr += maxk * 4;
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1_f16(outptr + j * 4, _sum);
                 }
@@ -1295,7 +1295,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                         kptr += maxk * 4;
                     }
 
-                    sum = activation_ss(sum, activation_type, activation_params);
+                    sum = activation_ss_f16(sum, activation_type, activation_params);
 
                     outptr[j] = (__fp16)sum;
                 }
@@ -1377,7 +1377,7 @@ int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
                         kptr += maxk;
                     }
 
-                    sum = activation_ss(sum, activation_type, activation_params);
+                    sum = activation_ss_f16(sum, activation_type, activation_params);
 
                     outptr[j] = (__fp16)sum;
                 }
diff --git a/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp b/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp
index 317a7106ece..09e0fca4356 100644
--- a/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp
+++ b/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp
@@ -464,7 +464,7 @@ int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_
                         }
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1q_f16(outptr + j * 8, _sum);
                 }
@@ -528,7 +528,7 @@ int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_
                         }
                     }
 
-                    _sum = activation_ps(_sum, activation_type, activation_params);
+                    _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
                     vst1_f16(outptr + j * 4, _sum);
                 }
@@ -592,7 +592,7 @@ int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_
                         }
                     }
 
-                    sum = activation_ss(sum, activation_type, activation_params);
+                    sum = activation_ss_f16(sum, activation_type, activation_params);
 
                     outptr[j] = (__fp16)sum;
                 }
diff --git a/src/layer/arm/dequantize_arm_asimdhp.cpp b/src/layer/arm/dequantize_arm_asimdhp.cpp
index 62977abff9a..3e03c8638dd 100644
--- a/src/layer/arm/dequantize_arm_asimdhp.cpp
+++ b/src/layer/arm/dequantize_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {
diff --git a/src/layer/arm/eltwise_arm_asimdhp.cpp b/src/layer/arm/eltwise_arm_asimdhp.cpp
index 4335db6c34b..0d9cb62ac07 100644
--- a/src/layer/arm/eltwise_arm_asimdhp.cpp
+++ b/src/layer/arm/eltwise_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {
diff --git a/src/layer/arm/gelu_arm_asimdhp.cpp b/src/layer/arm/gelu_arm_asimdhp.cpp
index ea8b159cfa8..0b4b59ab5f9 100644
--- a/src/layer/arm/gelu_arm_asimdhp.cpp
+++ b/src/layer/arm/gelu_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #include "neon_mathfun.h"
 #if NCNN_ARM82
 #include "neon_mathfun_fp16s.h"
 #endif
@@ -92,7 +93,7 @@ int GELU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co
             _blob = vmulq_f16(_pLoad, _blob);
             _blob = vmulq_f16(vdupq_n_f16(0.044715f * 0.79788452f), _blob);
             _blob = vfmaq_f16(_blob, vdupq_n_f16(0.79788452f), _pLoad);
-            _blob = tanh_ps(_blob);
+            _blob = tanh_ps_f16(_blob);
             _blob = vaddq_f16(vdupq_n_f16(1.f), _blob);
             _blob = vmulq_f16(vdupq_n_f16(0.5f), vmulq_f16(_blob, _pLoad));
             vst1q_f16(ptr, _blob);
@@ -101,7 +102,7 @@ int GELU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co
 
         for (; i < size; i++)
         {
-            *ptr = (__fp16)0.5f * *ptr * ((__fp16)1.0f + tanhf((__fp16)0.79788452f * (*ptr + (__fp16)0.044715f * *ptr * *ptr * *ptr)));
+            *ptr = (__fp16)0.5f * *ptr * (__fp16)(1.0f + tanhf((__fp16)0.79788452f * (*ptr + (__fp16)0.044715f * *ptr * *ptr * *ptr)));
 
             ptr++;
         }
diff --git a/src/layer/arm/gemm_arm_asimdhp.cpp b/src/layer/arm/gemm_arm_asimdhp.cpp
index 155658ab077..ff840df3b50 100644
--- a/src/layer/arm/gemm_arm_asimdhp.cpp
+++ b/src/layer/arm/gemm_arm_asimdhp.cpp
@@ -2922,7 +2922,7 @@ int Gemm_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<
diff --git a/src/layer/arm/hardsigmoid_arm_asimdhp.cpp b/src/layer/arm/hardsigmoid_arm_asimdhp.cpp
--- a/src/layer/arm/hardsigmoid_arm_asimdhp.cpp
+++ b/src/layer/arm/hardsigmoid_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {
diff --git a/src/layer/arm/hardswish_arm_asimdhp.cpp b/src/layer/arm/hardswish_arm_asimdhp.cpp
index 18681676a9f..5ab492ef0d8 100644
--- a/src/layer/arm/hardswish_arm_asimdhp.cpp
+++ b/src/layer/arm/hardswish_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {
diff --git a/src/layer/arm/innerproduct_arm_asimdhp.cpp b/src/layer/arm/innerproduct_arm_asimdhp.cpp
index b859c307c6e..de475d5ca59 100644
--- a/src/layer/arm/innerproduct_arm_asimdhp.cpp
+++ b/src/layer/arm/innerproduct_arm_asimdhp.cpp
@@ -120,14 +120,14 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 8;
             }
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
-            _sum1 = activation_ps(_sum1, activation_type, activation_params);
-            _sum2 = activation_ps(_sum2, activation_type, activation_params);
-            _sum3 = activation_ps(_sum3, activation_type, activation_params);
-            _sum4 = activation_ps(_sum4, activation_type, activation_params);
-            _sum5 = activation_ps(_sum5, activation_type, activation_params);
-            _sum6 = activation_ps(_sum6, activation_type, activation_params);
-            _sum7 = activation_ps(_sum7, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
+            _sum1 = activation_ps_f16(_sum1, activation_type, activation_params);
+            _sum2 = activation_ps_f16(_sum2, activation_type, activation_params);
+            _sum3 = activation_ps_f16(_sum3, activation_type, activation_params);
+            _sum4 = activation_ps_f16(_sum4, activation_type, activation_params);
+            _sum5 = activation_ps_f16(_sum5, activation_type, activation_params);
+            _sum6 = activation_ps_f16(_sum6, activation_type, activation_params);
+            _sum7 = activation_ps_f16(_sum7, activation_type, activation_params);
 
             vst1q_f16(outptr, _sum0);
             vst1q_f16(outptr + 8, _sum1);
@@ -167,7 +167,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 8;
             }
 
-            _sum = activation_ps(_sum, activation_type, activation_params);
+            _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
             vst1q_f16(outptr, _sum);
             outptr += 8;
@@ -221,14 +221,14 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 8;
             }
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
-            _sum1 = activation_ps(_sum1, activation_type, activation_params);
-            _sum2 = activation_ps(_sum2, activation_type, activation_params);
-            _sum3 = activation_ps(_sum3, activation_type, activation_params);
-            _sum4 = activation_ps(_sum4, activation_type, activation_params);
-            _sum5 = activation_ps(_sum5, activation_type, activation_params);
-            _sum6 = activation_ps(_sum6, activation_type, activation_params);
-            _sum7 = activation_ps(_sum7, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
+            _sum1 = activation_ps_f16(_sum1, activation_type, activation_params);
+            _sum2 = activation_ps_f16(_sum2, activation_type, activation_params);
+            _sum3 = activation_ps_f16(_sum3, activation_type, activation_params);
+            _sum4 = activation_ps_f16(_sum4, activation_type, activation_params);
+            _sum5 = activation_ps_f16(_sum5, activation_type, activation_params);
+            _sum6 = activation_ps_f16(_sum6, activation_type, activation_params);
+            _sum7 = activation_ps_f16(_sum7, activation_type, activation_params);
 
             vst1_f16(outptr, _sum0);
             vst1_f16(outptr + 4, _sum1);
@@ -268,7 +268,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 1;
             }
 
-            _sum = activation_ps(_sum, activation_type, activation_params);
+            _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
             vst1q_f16(outptr, _sum);
             outptr += 8;
@@ -310,10 +310,10 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 4;
             }
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
-            _sum1 = activation_ps(_sum1, activation_type, activation_params);
-            _sum2 = activation_ps(_sum2, activation_type, activation_params);
-            _sum3 = activation_ps(_sum3, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
+            _sum1 = activation_ps_f16(_sum1, activation_type, activation_params);
+            _sum2 = activation_ps_f16(_sum2, activation_type, activation_params);
+            _sum3 = activation_ps_f16(_sum3, activation_type, activation_params);
 
             vst1q_f16(outptr, _sum0);
             vst1q_f16(outptr + 8, _sum1);
@@ -358,10 +358,10 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 4;
             }
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
-            _sum1 = activation_ps(_sum1, activation_type, activation_params);
-            _sum2 = activation_ps(_sum2, activation_type, activation_params);
-            _sum3 = activation_ps(_sum3, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
+            _sum1 = activation_ps_f16(_sum1, activation_type, activation_params);
+            _sum2 = activation_ps_f16(_sum2, activation_type, activation_params);
+            _sum3 = activation_ps_f16(_sum3, activation_type, activation_params);
 
             vst1_f16(outptr, _sum0);
             vst1_f16(outptr + 4, _sum1);
@@ -397,7 +397,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 4;
             }
 
-            _sum = activation_ps(_sum, activation_type, activation_params);
+            _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
             vst1_f16(outptr, _sum);
             outptr += 4;
@@ -430,7 +430,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 1;
             }
 
-            _sum = activation_ps(_sum, activation_type, activation_params);
+            _sum = activation_ps_f16(_sum, activation_type, activation_params);
 
             vst1_f16(outptr, _sum);
             outptr += 4;
@@ -461,7 +461,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
                 kptr += 1;
             }
 
-            sum = activation_ss(sum, activation_type, activation_params);
+            sum = activation_ss_f16(sum, activation_type, activation_params);
 
             outptr[0] = (__fp16)sum;
             outptr += 1;
@@ -614,7 +614,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
             _sum4 = vaddq_f16(_sum4, _sum6);
             _sum0 = vaddq_f16(_sum0, _sum4);
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
 
             __fp16* outptr = (__fp16*)top_blob;
             vst1q_f16(outptr + p * 8, _sum0);
@@ -739,7 +739,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
             _sum4 = vadd_f16(_sum4, _sum6);
             _sum0 = vadd_f16(_sum0, _sum4);
 
-            _sum0 = activation_ps(_sum0, activation_type, activation_params);
+            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
 
             __fp16* outptr = (__fp16*)top_blob;
             vst1_f16(outptr + p * 4, _sum0);
@@ -787,7 +787,7 @@ int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, cons
             float16x4_t _s4 = vadd_f16(vget_low_f16(_sum), vget_high_f16(_sum));
             sum += vaddvq_f32(vcvt_f32_f16(_s4)); // dot
 
-            sum = activation_ss(sum, activation_type, activation_params);
+            sum = activation_ss_f16(sum, activation_type, activation_params);
 
             __fp16* outptr = (__fp16*)top_blob;
             outptr[p] = (__fp16)sum;
diff --git a/src/layer/arm/instancenorm_arm_asimdhp.cpp b/src/layer/arm/instancenorm_arm_asimdhp.cpp
index 4ea0b6d55b7..67e408122ff 100644
--- a/src/layer/arm/instancenorm_arm_asimdhp.cpp
+++ b/src/layer/arm/instancenorm_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {
@@ -231,8 +232,9 @@ int InstanceNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option&
 #endif // __ARM_NEON
         for (; i < size; i++)
         {
-            float tmp = *ptr++ - mean;
+            float tmp = (float)*ptr - mean;
             sqsum += tmp * tmp;
+            ptr++;
         }
         float var = sqsum / size;
         // the var maybe minus due to accuracy
@@ -245,13 +247,15 @@ int InstanceNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option&
             float gamma = gamma_data[q];
             float beta = beta_data[q];
 
-            a = (__fp16)(gamma / (sqrtf(var + eps)));
-            b = (__fp16)(-mean * a + beta);
+            float a_fp32 = gamma / (sqrtf(var + eps));
+            a = (__fp16)(a_fp32);
+            b = (__fp16)(-mean * a_fp32 + beta);
         }
         else
         {
-            a = (__fp16)(1.f / (sqrtf(var + eps)));
-            b = (__fp16)(-mean * a);
+            float a_fp32 = 1.f / (sqrtf(var + eps));
+            a = (__fp16)(a_fp32);
+            b = (__fp16)(-mean * a_fp32);
         }
 
         i = 0;
diff --git a/src/layer/arm/interp_arm_asimdhp.cpp b/src/layer/arm/interp_arm_asimdhp.cpp
index 286c74fe40c..b7d78d7bb91 100644
--- a/src/layer/arm/interp_arm_asimdhp.cpp
+++ b/src/layer/arm/interp_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {
diff --git a/src/layer/arm/interp_bicubic_fp16s.h b/src/layer/arm/interp_bicubic_fp16s.h
index be3e8159d87..18b2e56799e 100644
--- a/src/layer/arm/interp_bicubic_fp16s.h
+++ b/src/layer/arm/interp_bicubic_fp16s.h
@@ -24,7 +24,7 @@ static inline void interpolate_cubic_fp16sa(float fx, __fp16* coeffs)
     coeffs[0] = (__fp16)(A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A);
     coeffs[1] = (__fp16)((A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1);
     coeffs[2] = (__fp16)((A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1);
-    coeffs[3] = (__fp16)(1.f - coeffs[0] - coeffs[1] - coeffs[2]);
+    coeffs[3] = (__fp16)((__fp16)1.f - coeffs[0] - coeffs[1] - coeffs[2]);
 }
 
 static void cubic_coeffs_fp16sa(int w, int outw, int* xofs, __fp16* alpha, int align_corner)
@@ -51,7 +51,7 @@ static void cubic_coeffs_fp16sa(int w, int outw, int* xofs, __fp16* alpha, int a
             if (sx <= -1)
             {
                 sx = 1;
-                alpha[dx * 4 + 0] = (__fp16)(1.f - alpha[dx * 4 + 3]);
+                alpha[dx * 4 + 0] = (__fp16)((__fp16)1.f - alpha[dx * 4 + 3]);
                 alpha[dx * 4 + 1] = (__fp16)alpha[dx * 4 + 3];
                 alpha[dx * 4 + 2] = (__fp16)0.f;
                 alpha[dx * 4 + 3] = (__fp16)0.f;
@@ -75,7 +75,7 @@ static void cubic_coeffs_fp16sa(int w, int outw, int* xofs, __fp16* alpha, int a
             if (sx >= w - 1)
             {
                 sx = w - 3;
-                alpha[dx * 4 + 3] = (__fp16)(1.f - alpha[dx * 4 + 0]);
+                alpha[dx * 4 + 3] = (__fp16)((__fp16)1.f - alpha[dx * 4 + 0]);
                 alpha[dx * 4 + 2] = (__fp16)(alpha[dx * 4 + 0]);
                 alpha[dx * 4 + 1] = (__fp16)0.f;
                 alpha[dx * 4 + 0] = (__fp16)0.f;
diff --git a/src/layer/arm/mish_arm_asimdhp.cpp b/src/layer/arm/mish_arm_asimdhp.cpp
index 0e04883370e..70dd74ae22c 100644
--- a/src/layer/arm/mish_arm_asimdhp.cpp
+++ b/src/layer/arm/mish_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #include "neon_mathfun.h"
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #include "neon_mathfun_fp16s.h"
 #endif
@@ -99,7 +100,7 @@ int Mish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co
         for (int i = 0; i < size; i++)
         {
             float16x8_t _p = vld1q_f16(ptr);
-            _p = vmulq_f16(_p, tanh_ps(log_ps(vaddq_f16(exp_ps(_p), vdupq_n_f16(1.f)))));
+            _p = vmulq_f16(_p, tanh_ps_f16(log_ps_f16(vaddq_f16(exp_ps_f16(_p), vdupq_n_f16(1.f)))));
             vst1q_f16(ptr, _p);
 
             ptr += 8;
@@ -119,7 +120,7 @@ int Mish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co
         for (int i = 0; i < size; i++)
         {
             float16x4_t _p = vld1_f16(ptr);
-            _p = vmul_f16(_p, tanh_ps(log_ps(vadd_f16(exp_ps(_p), vdup_n_f16(1.f)))));
+            _p = vmul_f16(_p, tanh_ps_f16(log_ps_f16(vadd_f16(exp_ps_f16(_p), vdup_n_f16(1.f)))));
             vst1_f16(ptr, _p);
 
             ptr += 4;
@@ -138,7 +139,7 @@ int Mish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co
         for (; i + 3 < size; i += 4)
         {
             float16x4_t _p = vld1_f16(ptr);
-            _p = vmul_f16(_p, tanh_ps(log_ps(vadd_f16(exp_ps(_p), vdup_n_f16(1.f)))));
+            _p = vmul_f16(_p, tanh_ps_f16(log_ps_f16(vadd_f16(exp_ps_f16(_p), vdup_n_f16(1.f)))));
             vst1_f16(ptr, _p);
 
             ptr += 4;
@@ -146,7 +147,7 @@ int Mish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co
         for (; i < size; i++)
         {
             __fp16 v = *ptr;
-            v = v * tanhf(logf(expf(v) + (__fp16)1.f));
+            v = v * (__fp16)tanhf(logf(expf(v) + 1.f));
             *ptr = v;
             ptr++;
         }
diff --git a/src/layer/arm/neon_mathfun_fp16s.h b/src/layer/arm/neon_mathfun_fp16s.h
index 074681809bf..2f4864c13a3 100644
--- a/src/layer/arm/neon_mathfun_fp16s.h
+++ b/src/layer/arm/neon_mathfun_fp16s.h
@@ -61,7 +61,7 @@
 /* natural logarithm computed for 4 simultaneous float
  * return NaN for x <= 0
  */
-static inline float16x4_t log_ps(float16x4_t x)
+static inline float16x4_t log_ps_f16(float16x4_t x)
 {
     float16x4_t one = vdup_n_f16(1);
 
@@ -119,7 +119,7 @@ static inline float16x4_t log_ps(float16x4_t x)
     return x;
 }
 
-static inline float16x8_t log_ps(float16x8_t x)
+static inline float16x8_t log_ps_f16(float16x8_t x)
 {
     float16x8_t one = vdupq_n_f16(1);
 
@@ -192,7 +192,7 @@ static inline float16x8_t log_ps(float16x8_t x)
 #define c_cephes_exp_p5 5.0000001201E-1
 
 /* exp() computed for 4 float at once */
-static inline float16x4_t exp_ps(float16x4_t x)
+static inline float16x4_t exp_ps_f16(float16x4_t x)
 {
     float16x4_t tmp, fx;
 
@@ -201,7 +201,11 @@ static inline float16x4_t exp_ps(float16x4_t x)
     x = vmax_f16(x, vdup_n_f16(c_exp_lo_f16));
 
     /* express exp(x) as exp(g + n*log(2)) */
+#if _MSC_VER
+    fx = vfma_f16(vdup_n_f16(0.5f), x, vcvt_f16_f32(vdupq_n_f32(c_cephes_LOG2EF)));
+#else
     fx = vfma_f16(vdup_n_f16(0.5f), x, vdup_n_f16(c_cephes_LOG2EF));
+#endif
 
     /* perform a floorf */
     tmp = vcvt_f16_s16(vcvt_s16_f16(fx));
@@ -212,8 +216,13 @@ static inline float16x4_t exp_ps(float16x4_t x)
 
     fx = vsub_f16(tmp, (float16x4_t)(mask));
 
+#if _MSC_VER
+    tmp = vmul_f16(fx, vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1)));
+    float16x4_t z = vmul_f16(fx, vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C2)));
+#else
     tmp = vmul_f16(fx, vdup_n_f16(c_cephes_exp_C1));
     float16x4_t z = vmul_f16(fx, vdup_n_f16(c_cephes_exp_C2));
+#endif
     x = vsub_f16(x, tmp);
     x = vsub_f16(x, z);
 
@@ -240,7 +249,7 @@ static inline float16x4_t exp_ps(float16x4_t x)
     return y;
 }
 
-static inline float16x8_t exp_ps(float16x8_t x)
+static inline float16x8_t exp_ps_f16(float16x8_t x)
 {
     float16x8_t tmp, fx;
 
@@ -249,7 +258,12 @@ static inline float16x8_t exp_ps(float16x8_t x)
     x = vmaxq_f16(x, vdupq_n_f16(c_exp_lo_f16));
 
     /* express exp(x) as exp(g + n*log(2)) */
+#if _MSC_VER
+    float16x4_t _c_cephes_LOG2EF = vcvt_f16_f32(vdupq_n_f32(c_cephes_LOG2EF));
+    fx = vfmaq_f16(vdupq_n_f16(0.5f), x, vcombine_f16(_c_cephes_LOG2EF, _c_cephes_LOG2EF));
+#else
     fx = vfmaq_f16(vdupq_n_f16(0.5f), x, vdupq_n_f16(c_cephes_LOG2EF));
+#endif
 
     /* perform a floorf */
     tmp = vcvtq_f16_s16(vcvtq_s16_f16(fx));
@@ -260,8 +274,15 @@ static inline float16x8_t exp_ps(float16x8_t x)
 
     fx = vsubq_f16(tmp, vreinterpretq_f16_u16(mask));
 
+#if _MSC_VER
+    float16x4_t _c_cephes_exp_C1 = vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1));
+    tmp = vmulq_f16(fx, vcombine_f16(_c_cephes_exp_C1, _c_cephes_exp_C1));
+    float16x4_t _c_cephes_exp_C2 = vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C2));
+    float16x8_t z = vmulq_f16(fx, vcombine_f16(_c_cephes_exp_C2, _c_cephes_exp_C2));
+#else
     tmp = vmulq_f16(fx, vdupq_n_f16(c_cephes_exp_C1));
     float16x8_t z = vmulq_f16(fx, vdupq_n_f16(c_cephes_exp_C2));
+#endif
     x = vsubq_f16(x, tmp);
     x = vsubq_f16(x, z);
 
@@ -314,7 +335,7 @@ static inline float16x8_t exp_ps(float16x8_t x)
  * almost no extra price so both sin_ps and cos_ps make use of
  * sincos_ps..
  */
-static inline void sincos_ps(float16x4_t x, float16x4_t* ysin, float16x4_t* ycos)
+static inline void sincos_ps_f16(float16x4_t x, float16x4_t* ysin, float16x4_t* ycos)
 {
     // any x
     float16x4_t y;
@@ -326,7 +347,12 @@ static inline void sincos_ps(float16x4_t x, float16x4_t* ysin, float16x4_t* ycos
     x = vabs_f16(x);
 
     /* scale by 4/Pi */
+#if _MSC_VER
+    float16x4_t _c_cephes_FOPI = vcvt_f16_f32(vdupq_n_f32(c_cephes_FOPI));
+    y = vmul_f16(x, _c_cephes_FOPI);
+#else
     y = vmul_f16(x, vdup_n_f16(c_cephes_FOPI));
+#endif
 
     /* store the integer part of y in mm0 */
     emm2 = vcvt_u16_f16(y);
@@ -345,9 +371,18 @@ static inline void sincos_ps(float16x4_t x, float16x4_t* ysin, float16x4_t* ycos
     /* The magic pass: "Extended precision modular arithmetic"
      * x = ((x - y * DP1) - y * DP2) - y * DP3; */
+#if _MSC_VER
+    float16x4_t _c_minus_cephes_DP1 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP1));
+    float16x4_t _c_minus_cephes_DP2 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP2));
+    float16x4_t _c_minus_cephes_DP3 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP3));
+    x = vfma_f16(x, y, _c_minus_cephes_DP1);
+    x = vfma_f16(x, y, _c_minus_cephes_DP2);
+    x = vfma_f16(x, y, _c_minus_cephes_DP3);
+#else
     x = vfma_f16(x, y, vdup_n_f16(c_minus_cephes_DP1));
     x = vfma_f16(x, y, vdup_n_f16(c_minus_cephes_DP2));
     x = vfma_f16(x, y, vdup_n_f16(c_minus_cephes_DP3));
+#endif
 
     sign_mask_sin = veor_u16(sign_mask_sin, vtst_u16(emm2, vdup_n_u16(4)));
     sign_mask_cos = vtst_u16(vsub_u16(emm2, vdup_n_u16(2)), vdup_n_u16(4));
@@ -375,7 +410,7 @@ static inline void sincos_ps(float16x4_t x, float16x4_t* ysin, float16x4_t* ycos
     *ycos = vbsl_f16(sign_mask_cos, yc, vneg_f16(yc));
 }
 
-static inline void sincos_ps(float16x8_t x, float16x8_t* ysin, float16x8_t* ycos)
+static inline void sincos_ps_f16(float16x8_t x, float16x8_t* ysin, float16x8_t* ycos)
 {
     // any x
     float16x8_t y;
@@ -387,7 +422,12 @@ static inline void sincos_ps(float16x8_t x, float16x8_t* ysin, float16x8_t* ycos
     x = vabsq_f16(x);
 
     /* scale by 4/Pi */
+#if _MSC_VER
+    float16x4_t _c_cephes_FOPI = vcvt_f16_f32(vdupq_n_f32(c_cephes_FOPI));
+    y = vmulq_f16(x, vcombine_f16(_c_cephes_FOPI, _c_cephes_FOPI));
+#else
     y = vmulq_f16(x, vdupq_n_f16(c_cephes_FOPI));
+#endif
 
     /* store the integer part of y in mm0 */
     emm2 = vcvtq_u16_f16(y);
@@ -406,9 +446,18 @@ static inline void sincos_ps(float16x8_t x, float16x8_t* ysin, float16x8_t* ycos
     /* The magic pass: "Extended precision modular arithmetic"
      * x = ((x - y * DP1) - y * DP2) - y * DP3; */
+#if _MSC_VER
+    float16x4_t _c_minus_cephes_DP1 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP1));
+    float16x4_t _c_minus_cephes_DP2 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP2));
+    float16x4_t _c_minus_cephes_DP3 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP3));
+    x = vfmaq_f16(x, y, vcombine_f16(_c_minus_cephes_DP1, _c_minus_cephes_DP1));
+    x = vfmaq_f16(x, y, 
vcombine_f16(_c_minus_cephes_DP2, _c_minus_cephes_DP2)); + x = vfmaq_f16(x, y, vcombine_f16(_c_minus_cephes_DP3, _c_minus_cephes_DP3)); +#else x = vfmaq_f16(x, y, vdupq_n_f16(c_minus_cephes_DP1)); x = vfmaq_f16(x, y, vdupq_n_f16(c_minus_cephes_DP2)); x = vfmaq_f16(x, y, vdupq_n_f16(c_minus_cephes_DP3)); +#endif sign_mask_sin = veorq_u16(sign_mask_sin, vtstq_u16(emm2, vdupq_n_u16(4))); sign_mask_cos = vtstq_u16(vsubq_u16(emm2, vdupq_n_u16(2)), vdupq_n_u16(4)); @@ -436,31 +485,31 @@ static inline void sincos_ps(float16x8_t x, float16x8_t* ysin, float16x8_t* ycos *ycos = vbslq_f16(sign_mask_cos, yc, vnegq_f16(yc)); } -static inline float16x4_t sin_ps(float16x4_t x) +static inline float16x4_t sin_ps_f16(float16x4_t x) { float16x4_t ysin, ycos; - sincos_ps(x, &ysin, &ycos); + sincos_ps_f16(x, &ysin, &ycos); return ysin; } -static inline float16x8_t sin_ps(float16x8_t x) +static inline float16x8_t sin_ps_f16(float16x8_t x) { float16x8_t ysin, ycos; - sincos_ps(x, &ysin, &ycos); + sincos_ps_f16(x, &ysin, &ycos); return ysin; } -static inline float16x4_t cos_ps(float16x4_t x) +static inline float16x4_t cos_ps_f16(float16x4_t x) { float16x4_t ysin, ycos; - sincos_ps(x, &ysin, &ycos); + sincos_ps_f16(x, &ysin, &ycos); return ycos; } -static inline float16x8_t cos_ps(float16x8_t x) +static inline float16x8_t cos_ps_f16(float16x8_t x) { float16x8_t ysin, ycos; - sincos_ps(x, &ysin, &ycos); + sincos_ps_f16(x, &ysin, &ycos); return ycos; } @@ -481,7 +530,7 @@ static inline float16x8_t cos_ps(float16x8_t x) #define c_tanh_beta_6 1.19825839466702e-6f /* Single precision hyperbolic tangent computed for 4 simultaneous float */ -static inline float16x4_t tanh_ps(float16x4_t x) +static inline float16x4_t tanh_ps_f16(float16x4_t x) { float16x4_t x2 = vabs_f16(x); @@ -522,7 +571,7 @@ static inline float16x4_t tanh_ps(float16x4_t x) return y; } -static inline float16x8_t tanh_ps(float16x8_t x) +static inline float16x8_t tanh_ps_f16(float16x8_t x) { float16x8_t x2 = vabsq_f16(x); @@ -563,20 +612,20 @@ static inline float16x8_t tanh_ps(float16x8_t x) return y; } -static inline float16x4_t sigmoid_ps(float16x4_t _v) +static inline float16x4_t sigmoid_ps_f16(float16x4_t _v) { float16x4_t _one = vdup_n_f16(1.f); _v = vneg_f16(_v); - _v = exp_ps(_v); + _v = exp_ps_f16(_v); _v = vadd_f16(_v, _one); return vdiv_f16(_one, _v); } -static inline float16x8_t sigmoid_ps(float16x8_t _v) +static inline float16x8_t sigmoid_ps_f16(float16x8_t _v) { float16x8_t _one = vdupq_n_f16(1.f); _v = vnegq_f16(_v); - _v = exp_ps(_v); + _v = exp_ps_f16(_v); _v = vaddq_f16(_v, _one); return vdivq_f16(_one, _v); } diff --git a/src/layer/arm/pooling_arm_asimdhp.cpp b/src/layer/arm/pooling_arm_asimdhp.cpp index 856c6c22ae3..ceabb1fda38 100644 --- a/src/layer/arm/pooling_arm_asimdhp.cpp +++ b/src/layer/arm/pooling_arm_asimdhp.cpp @@ -18,6 +18,7 @@ #if __ARM_NEON #include +#include "arm_usability.h" #endif // __ARM_NEON namespace ncnn { @@ -611,7 +612,12 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt } } +#if _MSC_VER + float16x4_t _inv_area0 = vcvt_f16_f32(vdupq_n_f32(1.f / area)); + float16x8_t _inv_area = vcombine_f16(_inv_area0, _inv_area0); +#else float16x8_t _inv_area = vdupq_n_f16((__fp16)(1.f / area)); +#endif float16x8_t _avg = vmulq_f16(_sum, _inv_area); vst1q_f16(outptr + j * 8, _avg); } @@ -666,7 +672,11 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt } } +#if _MSC_VER + float16x4_t _inv_area = vcvt_f16_f32(vdupq_n_f32(1.f / area)); +#else float16x4_t _inv_area = 
vdup_n_f16((__fp16)(1.f / area)); +#endif float16x4_t _avg = vmul_f16(_sum, _inv_area); vst1_f16(outptr + j * 4, _avg); } @@ -721,7 +731,7 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt } } - outptr[j] = sum / area; + outptr[j] = sum / (__fp16)area; } outptr += outw; @@ -740,7 +750,12 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt const Mat m = bottom_blob_bordered.channel(q); __fp16* outptr = top_blob.channel(q); +#if _MSC_VER + float16x4_t _inv_maxk0 = vcvt_f16_f32(vdupq_n_f32(1.f / maxk)); + float16x8_t _inv_maxk = vcombine_f16(_inv_maxk0, _inv_maxk0); +#else float16x8_t _inv_maxk = vdupq_n_f16((__fp16)(1.f / maxk)); +#endif for (int i = 0; i < outh; i++) { @@ -773,7 +788,11 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt const Mat m = bottom_blob_bordered.channel(q); __fp16* outptr = top_blob.channel(q); +#if _MSC_VER + float16x4_t _inv_maxk = vcvt_f16_f32(vdupq_n_f32(1.f / maxk)); +#else float16x4_t _inv_maxk = vdup_n_f16((__fp16)(1.f / maxk)); +#endif for (int i = 0; i < outh; i++) { @@ -820,7 +839,7 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt sum += val; } - outptr[j] = sum / maxk; + outptr[j] = sum / (__fp16)maxk; } outptr += outw; diff --git a/src/layer/arm/prelu_arm_asimdhp.cpp b/src/layer/arm/prelu_arm_asimdhp.cpp index a87526b7cc3..0d22a37c54c 100644 --- a/src/layer/arm/prelu_arm_asimdhp.cpp +++ b/src/layer/arm/prelu_arm_asimdhp.cpp @@ -16,6 +16,7 @@ #if __ARM_NEON #include +#include "arm_usability.h" #endif // __ARM_NEON namespace ncnn { @@ -475,7 +476,11 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c const float slope = num_slope > 1 ? slope_data[i] : slope_data[0]; float16x4_t _zero = vdup_n_f16(0.f); +#if _MSC_VER + float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(slope)); +#else float16x4_t _slope = vdup_n_f16((__fp16)slope); +#endif int j = 0; for (; j + 3 < w; j += 4) @@ -514,7 +519,11 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c const float slope = num_slope > 1 ? 
slope_data[q] : slope_data[0]; float16x4_t _zero = vdup_n_f16(0.f); +#if _MSC_VER + float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(slope)); +#else float16x4_t _slope = vdup_n_f16((__fp16)slope); +#endif int j = 0; for (; j + 3 < size; j += 4) diff --git a/src/layer/arm/relu_arm_asimdhp.cpp b/src/layer/arm/relu_arm_asimdhp.cpp index 04f2b7f3234..55e75d88596 100644 --- a/src/layer/arm/relu_arm_asimdhp.cpp +++ b/src/layer/arm/relu_arm_asimdhp.cpp @@ -16,6 +16,7 @@ #if __ARM_NEON #include +#include "arm_usability.h" #endif // __ARM_NEON namespace ncnn { diff --git a/src/layer/arm/sigmoid_arm_asimdhp.cpp b/src/layer/arm/sigmoid_arm_asimdhp.cpp index 65c32ee3e67..5d777f2769b 100644 --- a/src/layer/arm/sigmoid_arm_asimdhp.cpp +++ b/src/layer/arm/sigmoid_arm_asimdhp.cpp @@ -16,6 +16,7 @@ #if __ARM_NEON #include +#include "arm_usability.h" #include "neon_mathfun.h" #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #include "neon_mathfun_fp16s.h" @@ -110,10 +111,10 @@ int Sigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) float16x8_t _p1 = vld1q_f16(ptr + 8); float16x8_t _p2 = vld1q_f16(ptr + 16); float16x8_t _p3 = vld1q_f16(ptr + 24); - _p0 = sigmoid_ps(_p0); - _p1 = sigmoid_ps(_p1); - _p2 = sigmoid_ps(_p2); - _p3 = sigmoid_ps(_p3); + _p0 = sigmoid_ps_f16(_p0); + _p1 = sigmoid_ps_f16(_p1); + _p2 = sigmoid_ps_f16(_p2); + _p3 = sigmoid_ps_f16(_p3); vst1q_f16(ptr, _p0); vst1q_f16(ptr + 8, _p1); vst1q_f16(ptr + 16, _p2); @@ -124,8 +125,8 @@ int Sigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) { float16x8_t _p0 = vld1q_f16(ptr); float16x8_t _p1 = vld1q_f16(ptr + 8); - _p0 = sigmoid_ps(_p0); - _p1 = sigmoid_ps(_p1); + _p0 = sigmoid_ps_f16(_p0); + _p1 = sigmoid_ps_f16(_p1); vst1q_f16(ptr, _p0); vst1q_f16(ptr + 8, _p1); ptr += 16; @@ -133,14 +134,14 @@ int Sigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) for (; i + 7 < size; i += 8) { float16x8_t _p = vld1q_f16(ptr); - _p = sigmoid_ps(_p); + _p = sigmoid_ps_f16(_p); vst1q_f16(ptr, _p); ptr += 8; } for (; i + 3 < size; i += 4) { float16x4_t _p = vld1_f16(ptr); - _p = sigmoid_ps(_p); + _p = sigmoid_ps_f16(_p); vst1_f16(ptr, _p); ptr += 4; } diff --git a/src/layer/arm/softmax_arm_asimdhp.cpp b/src/layer/arm/softmax_arm_asimdhp.cpp index d8efaf4c3b9..844e32ce908 100644 --- a/src/layer/arm/softmax_arm_asimdhp.cpp +++ b/src/layer/arm/softmax_arm_asimdhp.cpp @@ -18,6 +18,7 @@ #if __ARM_NEON #include +#include "arm_usability.h" #include "neon_mathfun_fp16s.h" #endif // __ARM_NEON @@ -72,7 +73,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) for (; i + 7 < size; i += 8) { float16x8_t _p = vld1q_f16(ptr); - _p = exp_ps(vsubq_f16(_p, _max)); + _p = exp_ps_f16(vsubq_f16(_p, _max)); vst1q_f16(ptr, _p); _sum = vaddq_f16(_sum, _p); ptr += 8; @@ -80,7 +81,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) for (; i + 3 < size; i += 4) { float16x4_t _p = vld1_f16(ptr); - _p = exp_ps(vsub_f16(_p, vget_low_f16(_max))); + _p = exp_ps_f16(vsub_f16(_p, vget_low_f16(_max))); vst1_f16(ptr, _p); _sum = vcombine_f16(vadd_f16(vget_low_f16(_sum), _p), vget_high_f16(_sum)); ptr += 4; @@ -243,10 +244,10 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float16x8_t _p3 = vld1q_f16(ptr + 24); float16x4_t _max = vld1_f16(maxptr); float16x4_t _sum = vld1_f16(sumptr); - _p0 = exp_ps(vsubq_f16(_p0, vdupq_lane_f16(_max, 0))); - _p1 = exp_ps(vsubq_f16(_p1, vdupq_lane_f16(_max, 1))); - _p2 = exp_ps(vsubq_f16(_p2, 
vdupq_lane_f16(_max, 2))); - _p3 = exp_ps(vsubq_f16(_p3, vdupq_lane_f16(_max, 3))); + _p0 = exp_ps_f16(vsubq_f16(_p0, vdupq_lane_f16(_max, 0))); + _p1 = exp_ps_f16(vsubq_f16(_p1, vdupq_lane_f16(_max, 1))); + _p2 = exp_ps_f16(vsubq_f16(_p2, vdupq_lane_f16(_max, 2))); + _p3 = exp_ps_f16(vsubq_f16(_p3, vdupq_lane_f16(_max, 3))); vst1q_f16(ptr, _p0); vst1q_f16(ptr + 8, _p1); vst1q_f16(ptr + 16, _p2); @@ -264,7 +265,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { float16x8_t _p = vld1q_f16(ptr); float16x8_t _max = vdupq_n_f16(*maxptr); - _p = exp_ps(vsubq_f16(_p, _max)); + _p = exp_ps_f16(vsubq_f16(_p, _max)); vst1q_f16(ptr, _p); float16x4_t _sum2 = vadd_f16(vget_low_f16(_p), vget_high_f16(_p)); float16x4_t _ss2 = vpadd_f16(_sum2, _sum2); @@ -286,8 +287,8 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float16x4_t _sum = vld1_f16(sumptr); float16x8_t _max0 = vcombine_f16(vdup_lane_f16(_max, 0), vdup_lane_f16(_max, 1)); float16x8_t _max1 = vcombine_f16(vdup_lane_f16(_max, 2), vdup_lane_f16(_max, 3)); - _p0 = exp_ps(vsubq_f16(_p0, _max0)); - _p1 = exp_ps(vsubq_f16(_p1, _max1)); + _p0 = exp_ps_f16(vsubq_f16(_p0, _max0)); + _p1 = exp_ps_f16(vsubq_f16(_p1, _max1)); vst1q_f16(ptr, _p0); vst1q_f16(ptr + 8, _p1); float16x8_t _ss2 = vpaddq_f16(_p0, _p1); @@ -301,7 +302,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { float16x4_t _p = vld1_f16(ptr); float16x4_t _max = vdup_n_f16(*maxptr); - _p = exp_ps(vsub_f16(_p, _max)); + _p = exp_ps_f16(vsub_f16(_p, _max)); vst1_f16(ptr, _p); float16x4_t _ss2 = vpadd_f16(_p, _p); __fp16 sum0 = vget_lane_f16(_ss2, 0) + vget_lane_f16(_ss2, 1); @@ -319,7 +320,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float16x8_t _p = vld1q_f16(ptr); float16x8_t _max = vld1q_f16(maxptr); float16x8_t _sum = vld1q_f16(sumptr); - _p = exp_ps(vsubq_f16(_p, _max)); + _p = exp_ps_f16(vsubq_f16(_p, _max)); _sum = vaddq_f16(_sum, _p); vst1q_f16(ptr, _p); vst1q_f16(sumptr, _sum); @@ -332,7 +333,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float16x4_t _p = vld1_f16(ptr); float16x4_t _max = vld1_f16(maxptr); float16x4_t _sum = vld1_f16(sumptr); - _p = exp_ps(vsub_f16(_p, _max)); + _p = exp_ps_f16(vsub_f16(_p, _max)); _sum = vadd_f16(_sum, _p); vst1_f16(ptr, _p); vst1_f16(sumptr, _sum); @@ -465,7 +466,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) for (; j + 7 < size; j += 8) { float16x8_t _p = vld1q_f16(ptr); - _p = exp_ps(vsubq_f16(_p, _max)); + _p = exp_ps_f16(vsubq_f16(_p, _max)); vst1q_f16(ptr, _p); _sum = vaddq_f16(_sum, _p); ptr += 8; @@ -473,7 +474,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) for (; j + 3 < size; j += 4) { float16x4_t _p = vld1_f16(ptr); - _p = exp_ps(vsub_f16(_p, vget_low_f16(_max))); + _p = exp_ps_f16(vsub_f16(_p, vget_low_f16(_max))); vst1_f16(ptr, _p); _sum = vcombine_f16(vadd_f16(vget_low_f16(_sum), _p), vget_high_f16(_sum)); ptr += 4; @@ -643,10 +644,10 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float16x8_t _p2 = vld1q_f16(ptr + 16); float16x8_t _p3 = vld1q_f16(ptr + 24); float16x4_t _max = vld1_f16(maxptr); - _p0 = exp_ps(vsubq_f16(_p0, vdupq_lane_f16(_max, 0))); - _p1 = exp_ps(vsubq_f16(_p1, vdupq_lane_f16(_max, 1))); - _p2 = exp_ps(vsubq_f16(_p2, vdupq_lane_f16(_max, 2))); - _p3 = exp_ps(vsubq_f16(_p3, vdupq_lane_f16(_max, 3))); + _p0 = 
exp_ps_f16(vsubq_f16(_p0, vdupq_lane_f16(_max, 0))); + _p1 = exp_ps_f16(vsubq_f16(_p1, vdupq_lane_f16(_max, 1))); + _p2 = exp_ps_f16(vsubq_f16(_p2, vdupq_lane_f16(_max, 2))); + _p3 = exp_ps_f16(vsubq_f16(_p3, vdupq_lane_f16(_max, 3))); vst1q_f16(ptr, _p0); vst1q_f16(ptr + 8, _p1); vst1q_f16(ptr + 16, _p2); @@ -658,7 +659,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { float16x8_t _p = vld1q_f16(ptr); float16x8_t _max = vdupq_n_f16(*maxptr); - _p = exp_ps(vsubq_f16(_p, _max)); + _p = exp_ps_f16(vsubq_f16(_p, _max)); vst1q_f16(ptr, _p); ptr += 8; maxptr++; @@ -674,8 +675,8 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float16x4_t _max = vld1_f16(maxptr); float16x8_t _max0 = vcombine_f16(vdup_lane_f16(_max, 0), vdup_lane_f16(_max, 1)); float16x8_t _max1 = vcombine_f16(vdup_lane_f16(_max, 2), vdup_lane_f16(_max, 3)); - _p0 = exp_ps(vsubq_f16(_p0, _max0)); - _p1 = exp_ps(vsubq_f16(_p1, _max1)); + _p0 = exp_ps_f16(vsubq_f16(_p0, _max0)); + _p1 = exp_ps_f16(vsubq_f16(_p1, _max1)); vst1q_f16(ptr, _p0); vst1q_f16(ptr + 8, _p1); ptr += 16; @@ -685,7 +686,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { float16x4_t _p = vld1_f16(ptr); float16x4_t _max = vdup_n_f16(*maxptr); - _p = exp_ps(vsub_f16(_p, _max)); + _p = exp_ps_f16(vsub_f16(_p, _max)); vst1_f16(ptr, _p); ptr += 4; maxptr++; @@ -698,7 +699,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { float16x8_t _p = vld1q_f16(ptr); float16x8_t _max = vld1q_f16(maxptr); - _p = exp_ps(vsubq_f16(_p, _max)); + _p = exp_ps_f16(vsubq_f16(_p, _max)); vst1q_f16(ptr, _p); ptr += 8; maxptr += 8; @@ -707,7 +708,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) { float16x4_t _p = vld1_f16(ptr); float16x4_t _max = vld1_f16(maxptr); - _p = exp_ps(vsub_f16(_p, _max)); + _p = exp_ps_f16(vsub_f16(_p, _max)); vst1_f16(ptr, _p); ptr += 4; maxptr += 4; @@ -943,7 +944,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float16x8_t _p = vld1q_f16(ptr); float16x8_t _max = vld1q_f16(maxptr); float16x8_t _sum = vld1q_f16(sumptr); - _p = exp_ps(vsubq_f16(_p, _max)); + _p = exp_ps_f16(vsubq_f16(_p, _max)); _sum = vaddq_f16(_sum, _p); vst1q_f16(ptr, _p); vst1q_f16(sumptr, _sum); @@ -956,7 +957,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float16x4_t _p = vld1_f16(ptr); float16x4_t _max = vld1_f16(maxptr); float16x4_t _sum = vld1_f16(sumptr); - _p = exp_ps(vsub_f16(_p, _max)); + _p = exp_ps_f16(vsub_f16(_p, _max)); _sum = vadd_f16(_sum, _p); vst1_f16(ptr, _p); vst1_f16(sumptr, _sum); @@ -1070,7 +1071,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) for (; j + 7 < size; j += 8) { float16x8_t _p = vld1q_f16(ptr); - _p = exp_ps(vsubq_f16(_p, _max)); + _p = exp_ps_f16(vsubq_f16(_p, _max)); vst1q_f16(ptr, _p); _sum = vaddq_f16(_sum, _p); ptr += 8; @@ -1078,7 +1079,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) for (; j + 3 < size; j += 4) { float16x4_t _p = vld1_f16(ptr); - _p = exp_ps(vsub_f16(_p, vget_low_f16(_max))); + _p = exp_ps_f16(vsub_f16(_p, vget_low_f16(_max))); vst1_f16(ptr, _p); _sum = vcombine_f16(vadd_f16(vget_low_f16(_sum), _p), vget_high_f16(_sum)); ptr += 4; diff --git a/src/layer/arm/swish_arm_asimdhp.cpp b/src/layer/arm/swish_arm_asimdhp.cpp index 4aee8a898c4..6cda2d0dd72 100644 --- a/src/layer/arm/swish_arm_asimdhp.cpp +++ 
b/src/layer/arm/swish_arm_asimdhp.cpp @@ -16,6 +16,7 @@ #if __ARM_NEON #include +#include "arm_usability.h" #include "neon_mathfun.h" #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #include "neon_mathfun_fp16s.h" @@ -128,8 +129,8 @@ int Swish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c { float16x8_t _p0 = vld1q_f16(ptr); float16x8_t _p1 = vld1q_f16(ptr + 8); - _p0 = vdivq_f16(_p0, vaddq_f16(_one, exp_ps(vnegq_f16(_p0)))); - _p1 = vdivq_f16(_p1, vaddq_f16(_one, exp_ps(vnegq_f16(_p1)))); + _p0 = vdivq_f16(_p0, vaddq_f16(_one, exp_ps_f16(vnegq_f16(_p0)))); + _p1 = vdivq_f16(_p1, vaddq_f16(_one, exp_ps_f16(vnegq_f16(_p1)))); vst1q_f16(ptr, _p0); vst1q_f16(ptr + 8, _p1); ptr += 16; @@ -137,21 +138,21 @@ int Swish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c for (; i + 7 < size; i += 8) { float16x8_t _p = vld1q_f16(ptr); - _p = vdivq_f16(_p, vaddq_f16(_one, exp_ps(vnegq_f16(_p)))); + _p = vdivq_f16(_p, vaddq_f16(_one, exp_ps_f16(vnegq_f16(_p)))); vst1q_f16(ptr, _p); ptr += 8; } for (; i + 3 < size; i += 4) { float16x4_t _p = vld1_f16(ptr); - _p = vdiv_f16(_p, vadd_f16(vget_low_f16(_one), exp_ps(vneg_f16(_p)))); + _p = vdiv_f16(_p, vadd_f16(vget_low_f16(_one), exp_ps_f16(vneg_f16(_p)))); vst1_f16(ptr, _p); ptr += 4; } for (; i < size; i++) { __fp16 v = *ptr; - v = v / ((__fp16)1.f + expf(-v)); + v = v / (__fp16)(1.f + expf(-v)); *ptr = v; ptr++; diff --git a/src/layer/arm/tanh_arm_asimdhp.cpp b/src/layer/arm/tanh_arm_asimdhp.cpp index 10f3303a1ce..42194a945af 100644 --- a/src/layer/arm/tanh_arm_asimdhp.cpp +++ b/src/layer/arm/tanh_arm_asimdhp.cpp @@ -16,6 +16,7 @@ #if __ARM_NEON #include +#include "arm_usability.h" #include "neon_mathfun.h" #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #include "neon_mathfun_fp16s.h" @@ -99,7 +100,7 @@ int TanH_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co for (int i = 0; i < size; i++) { float16x8_t _p = vld1q_f16(ptr); - _p = tanh_ps(_p); + _p = tanh_ps_f16(_p); vst1q_f16(ptr, _p); ptr += 8; @@ -119,7 +120,7 @@ int TanH_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co for (int i = 0; i < size; i++) { float16x4_t _p = vld1_f16(ptr); - _p = tanh_ps(_p); + _p = tanh_ps_f16(_p); vst1_f16(ptr, _p); ptr += 4; @@ -138,7 +139,7 @@ int TanH_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) co for (; i + 3 < size; i += 4) { float16x4_t _p = vld1_f16(ptr); - _p = tanh_ps(_p); + _p = tanh_ps_f16(_p); vst1_f16(ptr, _p); ptr += 4; diff --git a/src/layer/arm/unaryop_arm_asimdhp.cpp b/src/layer/arm/unaryop_arm_asimdhp.cpp index ac64fc708f9..f42848226c0 100644 --- a/src/layer/arm/unaryop_arm_asimdhp.cpp +++ b/src/layer/arm/unaryop_arm_asimdhp.cpp @@ -19,6 +19,7 @@ #if __ARM_NEON #include +#include "arm_usability.h" #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #include "neon_mathfun_fp16s.h" #endif @@ -181,7 +182,7 @@ struct unary_op_rsqrt_fp16s { __fp16 func(const __fp16& x) const { - return (__fp16)1.f / sqrtf(x); + return (__fp16)1.f / (__fp16)sqrtf(x); } float16x4_t func_pack4(const float16x4_t& x) const { @@ -207,11 +208,11 @@ struct unary_op_exp_fp16s } float16x4_t func_pack4(const float16x4_t& x) const { - return exp_ps(x); + return exp_ps_f16(x); } float16x8_t func_pack8(const float16x8_t& x) const { - return exp_ps(x); + return exp_ps_f16(x); } }; @@ -223,11 +224,11 @@ struct unary_op_log_fp16s } float16x4_t func_pack4(const float16x4_t& x) const { - return log_ps(x); + return log_ps_f16(x); } float16x8_t func_pack8(const float16x8_t& x) const { - return log_ps(x); 
+        return log_ps_f16(x);
     }
 };

@@ -239,11 +240,11 @@ struct unary_op_sin_fp16s
     }
     float16x4_t func_pack4(const float16x4_t& x) const
     {
-        return sin_ps(x);
+        return sin_ps_f16(x);
     }
     float16x8_t func_pack8(const float16x8_t& x) const
     {
-        return sin_ps(x);
+        return sin_ps_f16(x);
     }
 };

@@ -255,11 +256,11 @@ struct unary_op_cos_fp16s
     }
     float16x4_t func_pack4(const float16x4_t& x) const
     {
-        return cos_ps(x);
+        return cos_ps_f16(x);
     }
     float16x8_t func_pack8(const float16x8_t& x) const
     {
-        return cos_ps(x);
+        return cos_ps_f16(x);
     }
 };

@@ -429,11 +430,11 @@ struct unary_op_tanh_fp16s
     }
     float16x4_t func_pack4(const float16x4_t& x) const
     {
-        return tanh_ps(x);
+        return tanh_ps_f16(x);
     }
     float16x8_t func_pack8(const float16x8_t& x) const
     {
-        return tanh_ps(x);
+        return tanh_ps_f16(x);
     }
 };

@@ -445,11 +446,11 @@ struct unary_op_log10_fp16s
     }
     float16x4_t func_pack4(const float16x4_t& x) const
     {
-        return vmul_f16(log_ps(x), vdup_n_f16(0.434294481903));
+        return vmul_f16(log_ps_f16(x), vdup_n_f16(0.434294481903));
     }
     float16x8_t func_pack8(const float16x8_t& x) const
     {
-        return vmulq_f16(log_ps(x), vdupq_n_f16(0.434294481903));
+        return vmulq_f16(log_ps_f16(x), vdupq_n_f16(0.434294481903));
     }
 };
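Note: the MSVC workaround applied at each call site above follows one pattern. MSVC's arm64 <arm_neon.h> provides the float16x4_t/float16x8_t vector types but no __fp16 scalar type, so an fp16 vector constant cannot be built by passing a scalar to vdup_n_f16; instead the value is splatted as fp32 with vdupq_n_f32 and narrowed with vcvt_f16_f32. The sketch below is illustrative only and is not part of this patch; the helper names are invented, and it assumes _MSC_VER identifies the MSVC arm64 toolchain.

#include <arm_neon.h>

// Hypothetical helpers, not from the patch: splat a float constant into an
// fp16 vector portably across GCC/Clang and MSVC arm64.
static inline float16x4_t dup_f16_from_f32(float v)
{
#if _MSC_VER
    // MSVC: splat as fp32, then narrow the whole vector to 4 x fp16
    return vcvt_f16_f32(vdupq_n_f32(v));
#else
    // GCC/Clang: __fp16 scalars are supported directly
    return vdup_n_f16((__fp16)v);
#endif
}

static inline float16x8_t dupq_f16_from_f32(float v)
{
#if _MSC_VER
    // widen 4 fp16 lanes to 8 by duplicating the half vector
    float16x4_t _v = vcvt_f16_f32(vdupq_n_f32(v));
    return vcombine_f16(_v, _v);
#else
    return vdupq_n_f16((__fp16)v);
#endif
}

With helpers like these, each of the #if _MSC_VER blocks above would collapse to a single expression, e.g. float16x8_t _inv_area = dupq_f16_from_f32(1.f / area); the patch instead inlines the conversion at every call site, which keeps the diff local to the files that fail to build.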