fix build with msvc arm64 asimdhp

Tencent · Nov 27, 2023 · 19a7a1a · 19a7a1a
1 parent 3785921
commit 19a7a1a
Show file tree

Hide file tree

Showing 34 changed files with 548 additions and 309 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -183,35 +183,35 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm")
             set(CMAKE_REQUIRED_FLAGS "/arch:armv8.0")
             check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)
 
-            # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
-            # check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16x8_t _s, _a, _b; _s = vfmaq_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16)
-            #
-            # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
-            # check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
-            #
-            # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
-            # check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
-            #
-            # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
-            # check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
-            #
-            # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
-            # check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)
-            #
-            # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-            # check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat16_t _s, _a, _b; svbool_t bp; _s = svmla_f16_z(bp, _s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE)
-            #
-            # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-            # check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint16_t _s; svint8_t _a, _b; _s = svmlslb_s16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)
-            #
-            # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-            # check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s; svbfloat16_t _a, _b; _s = svbfmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
-            #
-            # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-            # check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint32_t _s; svint8_t _a, _b; _s = svmmla_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
-            #
-            # set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
-            # check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s, _a, _b; _s = svmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16x8_t _s, _a, _b; _s = vfmaq_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16)
+
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
+
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
+
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
+
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)
+
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat16_t _s, _a, _b; svbool_t bp; _s = svmla_f16_z(bp, _s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE)
+
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint16_t _s; svint8_t _a, _b; _s = svmlslb_s16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)
+
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s; svbfloat16_t _a, _b; _s = svbfmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
+
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint32_t _s; svint8_t _a, _b; _s = svmmla_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
+
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s, _a, _b; _s = svmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
 
             unset(CMAKE_REQUIRED_FLAGS)
         else()

diff --git a/src/layer/arm/arm_activation.h b/src/layer/arm/arm_activation.h
@@ -68,9 +68,10 @@ static inline float32x4_t activation_ps(float32x4_t _v, int activation_type, con
 }
 
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#include "arm_usability.h"
 #include "neon_mathfun_fp16s.h"
 
-static inline __fp16 activation_ss(__fp16 v, int activation_type, const ncnn::Mat& activation_params)
+static inline __fp16 activation_ss_f16(__fp16 v, int activation_type, const ncnn::Mat& activation_params)
 {
     if (activation_type == 1)
     {
@@ -92,11 +93,11 @@ static inline __fp16 activation_ss(__fp16 v, int activation_type, const ncnn::Ma
     }
     else if (activation_type == 4)
     {
-        v = (__fp16)1.f / ((__fp16)1.f + expf(-v));
+        v = (__fp16)1.f / ((__fp16)1.f + (__fp16)expf(-v));
     }
     else if (activation_type == 5)
     {
-        v = v * tanhf(logf(expf(v) + (__fp16)1.f));
+        v = v * (__fp16)tanhf(logf(expf((float)v) + 1.f));
     }
     else if (activation_type == 6)
     {
@@ -115,7 +116,7 @@ static inline __fp16 activation_ss(__fp16 v, int activation_type, const ncnn::Ma
     return v;
 }
 
-static inline float16x4_t activation_ps(float16x4_t _v, int activation_type, const ncnn::Mat& activation_params)
+static inline float16x4_t activation_ps_f16(float16x4_t _v, int activation_type, const ncnn::Mat& activation_params)
 {
     if (activation_type == 1)
     {
@@ -125,7 +126,11 @@ static inline float16x4_t activation_ps(float16x4_t _v, int activation_type, con
     else if (activation_type == 2)
     {
         const float16x4_t _zero = vdup_n_f16(0.f);
+#if _MSC_VER
+        const float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(activation_params[0]));
+#else
         const float16x4_t _slope = vdup_n_f16((__fp16)activation_params[0]);
+#endif
         const uint16x4_t _lemask = vcle_f16(_v, _zero);
         float16x4_t _ps = vmul_f16(_v, _slope);
         _v = vbsl_f16(_lemask, _ps, _v);
@@ -139,11 +144,11 @@ static inline float16x4_t activation_ps(float16x4_t _v, int activation_type, con
     }
     else if (activation_type == 4)
     {
-        _v = sigmoid_ps(_v);
+        _v = sigmoid_ps_f16(_v);
     }
     else if (activation_type == 5)
     {
-        _v = vmul_f16(_v, tanh_ps(log_ps(vadd_f16(exp_ps(_v), vdup_n_f16(1.f)))));
+        _v = vmul_f16(_v, tanh_ps_f16(log_ps_f16(vadd_f16(exp_ps_f16(_v), vdup_n_f16(1.f)))));
     }
     else if (activation_type == 6)
     {
@@ -161,7 +166,7 @@ static inline float16x4_t activation_ps(float16x4_t _v, int activation_type, con
     return _v;
 }
 
-static inline float16x8_t activation_ps(float16x8_t _v, int activation_type, const ncnn::Mat& activation_params)
+static inline float16x8_t activation_ps_f16(float16x8_t _v, int activation_type, const ncnn::Mat& activation_params)
 {
     if (activation_type == 1)
     {
@@ -171,7 +176,12 @@ static inline float16x8_t activation_ps(float16x8_t _v, int activation_type, con
     else if (activation_type == 2)
     {
         const float16x8_t _zero = vdupq_n_f16(0.f);
+#if _MSC_VER
+        const float16x4_t _slope0 = vcvt_f16_f32(vdupq_n_f32(activation_params[0]));
+        const float16x8_t _slope = vcombine_f16(_slope0, _slope0);
+#else
         const float16x8_t _slope = vdupq_n_f16((__fp16)activation_params[0]);
+#endif
         const uint16x8_t _lemask = vcleq_f16(_v, _zero);
         float16x8_t _ps = vmulq_f16(_v, _slope);
         _v = vbslq_f16(_lemask, _ps, _v);
@@ -185,11 +195,11 @@ static inline float16x8_t activation_ps(float16x8_t _v, int activation_type, con
     }
     else if (activation_type == 4)
     {
-        _v = sigmoid_ps(_v);
+        _v = sigmoid_ps_f16(_v);
     }
     else if (activation_type == 5)
     {
-        _v = vmulq_f16(_v, tanh_ps(log_ps(vaddq_f16(exp_ps(_v), vdupq_n_f16(1.f)))));
+        _v = vmulq_f16(_v, tanh_ps_f16(log_ps_f16(vaddq_f16(exp_ps_f16(_v), vdupq_n_f16(1.f)))));
     }
     else if (activation_type == 6)
     {

diff --git a/src/layer/arm/arm_usability.h b/src/layer/arm/arm_usability.h
@@ -123,6 +123,115 @@ static inline int8x8_t float2int8leakyrelu(float32x4_t _vlow, float32x4_t _vhigh
 }
 
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef _MSC_VER
+struct __fp16
+{
+    __fp16()
+    {
+        _u16 = 0;
+    }
+
+    __fp16(float f32)
+    {
+        _u16 = vget_lane_u16(vreinterpretq_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
+    }
+
+    __fp16(__n16 n16)
+    {
+        _u16 = n16.n16_u16[0];
+    }
+
+    operator const float() const
+    {
+        return vgetq_lane_f32(vcvt_f32_f16(vreinterpretq_f16_u16(vdup_n_u16(_u16))), 0);
+    }
+
+    __fp16& operator+=(const __fp16& b)
+    {
+        float a = (float)*this;
+        float f32 = (a + (float)b);
+        _u16 = vget_lane_u16(vreinterpretq_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
+        return *this;
+    }
+
+    __fp16& operator-=(const __fp16& b)
+    {
+        float a = (float)*this;
+        float f32 = (a - (float)b);
+        _u16 = vget_lane_u16(vreinterpretq_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
+        return *this;
+    }
+
+    __fp16& operator*=(const __fp16& b)
+    {
+        float a = (float)*this;
+        float f32 = (a * (float)b);
+        _u16 = vget_lane_u16(vreinterpretq_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
+        return *this;
+    }
+
+    __fp16& operator/=(const __fp16& b)
+    {
+        float a = (float)*this;
+        float f32 = (a / (float)b);
+        _u16 = vget_lane_u16(vreinterpretq_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
+        return *this;
+    }
+
+    unsigned short _u16;
+};
+
+static inline __fp16 operator-(const __fp16& a) { return __fp16(-(float)a); }
+static inline __fp16 operator+(const __fp16& a, const __fp16& b) { return __fp16((float)a + (float)b); }
+static inline __fp16 operator-(const __fp16& a, const __fp16& b) { return __fp16((float)a - (float)b); }
+static inline __fp16 operator*(const __fp16& a, const __fp16& b) { return __fp16((float)a * (float)b); }
+static inline __fp16 operator/(const __fp16& a, const __fp16& b) { return __fp16((float)a / (float)b); }
+
+static inline float16x4_t vdup_n_f16(const __fp16& f16)
+{
+    return vreinterpret_f16_u16(vdup_n_u16(f16._u16));
+}
+
+static inline float16x8_t vdupq_n_f16(const __fp16& f16)
+{
+    return vreinterpretq_f16_u16(vdupq_n_u16(f16._u16));
+}
+
+static inline __fp16 vmaxv_f16(float16x4_t a)
+{
+    return __fp16(vmaxvq_f32(vcvt_f32_f16(a)));
+}
+
+static inline __fp16 vmaxvq_f16(float16x8_t a)
+{
+    return __fp16(vmaxvq_f32(vcvt_f32_f16(vget_low_f16(a))) + vmaxvq_f32(vcvt_f32_f16(vget_high_f16(a))));
+}
+
+#define vld1q_f16 vld1q_u16
+#define vst1q_f16 vst1q_u16
+
+#define vld2_f16 vld2_u16
+#define vst2_f16 vst2_u16
+
+#define vld2q_f16 vld2q_u16
+#define vst2q_f16 vst2q_u16
+
+#define vld4_f16 vld4_u16
+#define vst4_f16 vst4_u16
+
+#define vld4q_f16 vld4q_u16
+#define vst4q_f16 vst4q_u16
+
+#define vld1q_dup_f16 vld1q_dup_u16
+
+#define vset_lane_f16(x, v, i)  vset_lane_u16(x._u16, (uint16x4_t)v, i)
+#define vsetq_lane_f16(x, v, i) vsetq_lane_u16(x._u16, (uint16x8_t)v, i)
+
+#define vfma_n_f16(va, vb, x) vfma_f16(va, vb, vdup_n_f16(x))
+#define vfmaq_n_f16(va, vb, x) vfmaq_f16(va, vb, vdupq_n_f16(x))
+
+#endif
+
 static inline signed char float2int8(__fp16 v)
 {
     int int32 = round(v);

diff --git a/src/layer/arm/batchnorm_arm_asimdhp.cpp b/src/layer/arm/batchnorm_arm_asimdhp.cpp
@@ -18,6 +18,8 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
+#include "arm_usability.h"
+
 namespace ncnn {
 
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -109,7 +111,7 @@ int BatchNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt
         #pragma omp parallel for num_threads(opt.num_threads)
         for (int i = 0; i < w; i++)
         {
-            ptr[i] = b_data[i] * ptr[i] + a_data[i];
+            ptr[i] = b_data[i] * (float)ptr[i] + a_data[i];
         }
     }
 
@@ -140,7 +142,7 @@ int BatchNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt
             }
             for (; j < w; j++)
             {
-                *ptr = b * *ptr + a;
+                *ptr = b * (float)*ptr + a;
 
                 ptr++;
             }
@@ -177,7 +179,7 @@ int BatchNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt
             }
             for (; j < size; j++)
             {
-                *ptr = b * *ptr + a;
+                *ptr = b * (float)*ptr + a;
 
                 ptr++;
             }
@@ -367,7 +369,11 @@ int BatchNorm_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
             __fp16 b = (__fp16)b_data[i];
 
             float16x4_t _a = vdup_n_f16(a);
+#if _MSC_VER
+            float16x4_t _b = vcvt_f16_f32(vdupq_n_f32(b_data[i]));
+#else
             float16x4_t _b = vdup_n_f16(b);
+#endif
 
             int j = 0;
             for (; j + 3 < w; j += 4)
@@ -404,7 +410,11 @@ int BatchNorm_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
             __fp16 b = (__fp16)b_data[q];
 
             float16x4_t _a = vdup_n_f16(a);
+#if _MSC_VER
+            float16x4_t _b = vcvt_f16_f32(vdupq_n_f32(b_data[q]));
+#else
             float16x4_t _b = vdup_n_f16(b);
+#endif
 
             int j = 0;
             for (; j + 3 < size; j += 4)

diff --git a/src/layer/arm/binaryop_arm_asimdhp.cpp b/src/layer/arm/binaryop_arm_asimdhp.cpp
@@ -17,6 +17,7 @@
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {

diff --git a/src/layer/arm/clip_arm_asimdhp.cpp b/src/layer/arm/clip_arm_asimdhp.cpp
@@ -16,6 +16,7 @@
 
 #ifdef __ARM_NEON
 #include <arm_neon.h>
+#include "arm_usability.h"
 #endif // __ARM_NEON
 
 namespace ncnn {

diff --git a/src/layer/arm/convolution1d_arm.cpp b/src/layer/arm/convolution1d_arm.cpp
@@ -18,8 +18,8 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
-#include "arm_activation.h"
 #include "arm_usability.h"
+#include "arm_activation.h"
 
 #include "cpu.h"
 #include "layer_type.h"