Skip to content

Commit

Permalink
fix build with msvc arm64 asimdhp
Browse files Browse the repository at this point in the history
  • Loading branch information
nihui committed Nov 27, 2023
1 parent 3785921 commit 19a7a1a
Show file tree
Hide file tree
Showing 34 changed files with 548 additions and 309 deletions.
58 changes: 29 additions & 29 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -183,35 +183,35 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm")
set(CMAKE_REQUIRED_FLAGS "/arch:armv8.0")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)

# set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
# check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16x8_t _s, _a, _b; _s = vfmaq_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16)
#
# set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
# check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
#
# set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
# check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
#
# set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
# check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
#
# set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
# check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)
#
# set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
# check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat16_t _s, _a, _b; svbool_t bp; _s = svmla_f16_z(bp, _s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE)
#
# set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
# check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint16_t _s; svint8_t _a, _b; _s = svmlslb_s16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)
#
# set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
# check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s; svbfloat16_t _a, _b; _s = svbfmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
#
# set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
# check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint32_t _s; svint8_t _a, _b; _s = svmmla_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
#
# set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
# check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s, _a, _b; _s = svmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16x8_t _s, _a, _b; _s = vfmaq_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16)

set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)

set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)

set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)

set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)

set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat16_t _s, _a, _b; svbool_t bp; _s = svmla_f16_z(bp, _s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE)

set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint16_t _s; svint8_t _a, _b; _s = svmlslb_s16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)

set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s; svbfloat16_t _a, _b; _s = svbfmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)

set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint32_t _s; svint8_t _a, _b; _s = svmmla_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)

set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s, _a, _b; _s = svmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)

unset(CMAKE_REQUIRED_FLAGS)
else()
Expand Down
28 changes: 19 additions & 9 deletions src/layer/arm/arm_activation.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,10 @@ static inline float32x4_t activation_ps(float32x4_t _v, int activation_type, con
}

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "arm_usability.h"
#include "neon_mathfun_fp16s.h"

static inline __fp16 activation_ss(__fp16 v, int activation_type, const ncnn::Mat& activation_params)
static inline __fp16 activation_ss_f16(__fp16 v, int activation_type, const ncnn::Mat& activation_params)
{
if (activation_type == 1)
{
Expand All @@ -92,11 +93,11 @@ static inline __fp16 activation_ss(__fp16 v, int activation_type, const ncnn::Ma
}
else if (activation_type == 4)
{
v = (__fp16)1.f / ((__fp16)1.f + expf(-v));
v = (__fp16)1.f / ((__fp16)1.f + (__fp16)expf(-v));
}
else if (activation_type == 5)
{
v = v * tanhf(logf(expf(v) + (__fp16)1.f));
v = v * (__fp16)tanhf(logf(expf((float)v) + 1.f));
}
else if (activation_type == 6)
{
Expand All @@ -115,7 +116,7 @@ static inline __fp16 activation_ss(__fp16 v, int activation_type, const ncnn::Ma
return v;
}

static inline float16x4_t activation_ps(float16x4_t _v, int activation_type, const ncnn::Mat& activation_params)
static inline float16x4_t activation_ps_f16(float16x4_t _v, int activation_type, const ncnn::Mat& activation_params)
{
if (activation_type == 1)
{
Expand All @@ -125,7 +126,11 @@ static inline float16x4_t activation_ps(float16x4_t _v, int activation_type, con
else if (activation_type == 2)
{
const float16x4_t _zero = vdup_n_f16(0.f);
#if _MSC_VER
const float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(activation_params[0]));
#else
const float16x4_t _slope = vdup_n_f16((__fp16)activation_params[0]);
#endif
const uint16x4_t _lemask = vcle_f16(_v, _zero);
float16x4_t _ps = vmul_f16(_v, _slope);
_v = vbsl_f16(_lemask, _ps, _v);
Expand All @@ -139,11 +144,11 @@ static inline float16x4_t activation_ps(float16x4_t _v, int activation_type, con
}
else if (activation_type == 4)
{
_v = sigmoid_ps(_v);
_v = sigmoid_ps_f16(_v);
}
else if (activation_type == 5)
{
_v = vmul_f16(_v, tanh_ps(log_ps(vadd_f16(exp_ps(_v), vdup_n_f16(1.f)))));
_v = vmul_f16(_v, tanh_ps_f16(log_ps_f16(vadd_f16(exp_ps_f16(_v), vdup_n_f16(1.f)))));
}
else if (activation_type == 6)
{
Expand All @@ -161,7 +166,7 @@ static inline float16x4_t activation_ps(float16x4_t _v, int activation_type, con
return _v;
}

static inline float16x8_t activation_ps(float16x8_t _v, int activation_type, const ncnn::Mat& activation_params)
static inline float16x8_t activation_ps_f16(float16x8_t _v, int activation_type, const ncnn::Mat& activation_params)
{
if (activation_type == 1)
{
Expand All @@ -171,7 +176,12 @@ static inline float16x8_t activation_ps(float16x8_t _v, int activation_type, con
else if (activation_type == 2)
{
const float16x8_t _zero = vdupq_n_f16(0.f);
#if _MSC_VER
const float16x4_t _slope0 = vcvt_f16_f32(vdupq_n_f32(activation_params[0]));
const float16x8_t _slope = vcombine_f16(_slope0, _slope0);
#else
const float16x8_t _slope = vdupq_n_f16((__fp16)activation_params[0]);
#endif
const uint16x8_t _lemask = vcleq_f16(_v, _zero);
float16x8_t _ps = vmulq_f16(_v, _slope);
_v = vbslq_f16(_lemask, _ps, _v);
Expand All @@ -185,11 +195,11 @@ static inline float16x8_t activation_ps(float16x8_t _v, int activation_type, con
}
else if (activation_type == 4)
{
_v = sigmoid_ps(_v);
_v = sigmoid_ps_f16(_v);
}
else if (activation_type == 5)
{
_v = vmulq_f16(_v, tanh_ps(log_ps(vaddq_f16(exp_ps(_v), vdupq_n_f16(1.f)))));
_v = vmulq_f16(_v, tanh_ps_f16(log_ps_f16(vaddq_f16(exp_ps_f16(_v), vdupq_n_f16(1.f)))));
}
else if (activation_type == 6)
{
Expand Down
109 changes: 109 additions & 0 deletions src/layer/arm/arm_usability.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,115 @@ static inline int8x8_t float2int8leakyrelu(float32x4_t _vlow, float32x4_t _vhigh
}

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#ifdef _MSC_VER
struct __fp16
{
__fp16()
{
_u16 = 0;
}

__fp16(float f32)
{
_u16 = vget_lane_u16(vreinterpretq_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
}

__fp16(__n16 n16)
{
_u16 = n16.n16_u16[0];
}

operator const float() const
{
return vgetq_lane_f32(vcvt_f32_f16(vreinterpretq_f16_u16(vdup_n_u16(_u16))), 0);
}

__fp16& operator+=(const __fp16& b)
{
float a = (float)*this;
float f32 = (a + (float)b);
_u16 = vget_lane_u16(vreinterpretq_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
return *this;
}

__fp16& operator-=(const __fp16& b)
{
float a = (float)*this;
float f32 = (a - (float)b);
_u16 = vget_lane_u16(vreinterpretq_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
return *this;
}

__fp16& operator*=(const __fp16& b)
{
float a = (float)*this;
float f32 = (a * (float)b);
_u16 = vget_lane_u16(vreinterpretq_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
return *this;
}

__fp16& operator/=(const __fp16& b)
{
float a = (float)*this;
float f32 = (a / (float)b);
_u16 = vget_lane_u16(vreinterpretq_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0);
return *this;
}

unsigned short _u16;
};

static inline __fp16 operator-(const __fp16& a) { return __fp16(-(float)a); }
static inline __fp16 operator+(const __fp16& a, const __fp16& b) { return __fp16((float)a + (float)b); }
static inline __fp16 operator-(const __fp16& a, const __fp16& b) { return __fp16((float)a - (float)b); }
static inline __fp16 operator*(const __fp16& a, const __fp16& b) { return __fp16((float)a * (float)b); }
static inline __fp16 operator/(const __fp16& a, const __fp16& b) { return __fp16((float)a / (float)b); }

static inline float16x4_t vdup_n_f16(const __fp16& f16)
{
return vreinterpret_f16_u16(vdup_n_u16(f16._u16));
}

static inline float16x8_t vdupq_n_f16(const __fp16& f16)
{
return vreinterpretq_f16_u16(vdupq_n_u16(f16._u16));
}

static inline __fp16 vmaxv_f16(float16x4_t a)
{
return __fp16(vmaxvq_f32(vcvt_f32_f16(a)));
}

static inline __fp16 vmaxvq_f16(float16x8_t a)
{
return __fp16(vmaxvq_f32(vcvt_f32_f16(vget_low_f16(a))) + vmaxvq_f32(vcvt_f32_f16(vget_high_f16(a))));
}

#define vld1q_f16 vld1q_u16
#define vst1q_f16 vst1q_u16

#define vld2_f16 vld2_u16
#define vst2_f16 vst2_u16

#define vld2q_f16 vld2q_u16
#define vst2q_f16 vst2q_u16

#define vld4_f16 vld4_u16
#define vst4_f16 vst4_u16

#define vld4q_f16 vld4q_u16
#define vst4q_f16 vst4q_u16

#define vld1q_dup_f16 vld1q_dup_u16

#define vset_lane_f16(x, v, i) vset_lane_u16(x._u16, (uint16x4_t)v, i)
#define vsetq_lane_f16(x, v, i) vsetq_lane_u16(x._u16, (uint16x8_t)v, i)

#define vfma_n_f16(va, vb, x) vfma_f16(va, vb, vdup_n_f16(x))
#define vfmaq_n_f16(va, vb, x) vfmaq_f16(va, vb, vdupq_n_f16(x))

#endif

static inline signed char float2int8(__fp16 v)
{
int int32 = round(v);
Expand Down
16 changes: 13 additions & 3 deletions src/layer/arm/batchnorm_arm_asimdhp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Expand Down Expand Up @@ -109,7 +111,7 @@ int BatchNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt
#pragma omp parallel for num_threads(opt.num_threads)
for (int i = 0; i < w; i++)
{
ptr[i] = b_data[i] * ptr[i] + a_data[i];
ptr[i] = b_data[i] * (float)ptr[i] + a_data[i];
}
}

Expand Down Expand Up @@ -140,7 +142,7 @@ int BatchNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt
}
for (; j < w; j++)
{
*ptr = b * *ptr + a;
*ptr = b * (float)*ptr + a;

ptr++;
}
Expand Down Expand Up @@ -177,7 +179,7 @@ int BatchNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt
}
for (; j < size; j++)
{
*ptr = b * *ptr + a;
*ptr = b * (float)*ptr + a;

ptr++;
}
Expand Down Expand Up @@ -367,7 +369,11 @@ int BatchNorm_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
__fp16 b = (__fp16)b_data[i];

float16x4_t _a = vdup_n_f16(a);
#if _MSC_VER
float16x4_t _b = vcvt_f16_f32(vdupq_n_f32(b_data[i]));
#else
float16x4_t _b = vdup_n_f16(b);
#endif

int j = 0;
for (; j + 3 < w; j += 4)
Expand Down Expand Up @@ -404,7 +410,11 @@ int BatchNorm_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
__fp16 b = (__fp16)b_data[q];

float16x4_t _a = vdup_n_f16(a);
#if _MSC_VER
float16x4_t _b = vcvt_f16_f32(vdupq_n_f32(b_data[q]));
#else
float16x4_t _b = vdup_n_f16(b);
#endif

int j = 0;
for (; j + 3 < size; j += 4)
Expand Down
1 change: 1 addition & 0 deletions src/layer/arm/binaryop_arm_asimdhp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#include "arm_usability.h"
#endif // __ARM_NEON

namespace ncnn {
Expand Down
1 change: 1 addition & 0 deletions src/layer/arm/clip_arm_asimdhp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#ifdef __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#endif // __ARM_NEON

namespace ncnn {
Expand Down
2 changes: 1 addition & 1 deletion src/layer/arm/convolution1d_arm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"
#include "arm_activation.h"

#include "cpu.h"
#include "layer_type.h"
Expand Down
Loading

0 comments on commit 19a7a1a

Please sign in to comment.