diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5c83d56e216..35e46535a3b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -330,7 +330,7 @@ if(NCNN_VULKAN) target_link_libraries(ncnn PRIVATE glslang SPIRV) endif() -if(NCNN_PLATFORM_API AND ANDROID_NDK) +if(NCNN_PLATFORM_API AND ANDROID) target_link_libraries(ncnn PUBLIC android jnigraphics log) endif() diff --git a/src/layer/x86/avx_mathfun.h b/src/layer/x86/avx_mathfun.h index 65c34efc23e..458bacbc971 100644 --- a/src/layer/x86/avx_mathfun.h +++ b/src/layer/x86/avx_mathfun.h @@ -311,7 +311,7 @@ _PS256_CONST(cephes_tanh_p8, 1.18534705686654e-04f); _PS256_CONST(cephes_tanh_p9, 2.26843463243900e-03f); // an approximation of tanh -static inline __m256 tanh256_ps(const __m256 x) +static NCNN_FORCEINLINE __m256 tanh256_ps(__m256 x) { __m256 value = x; value = _mm256_max_ps(*(__m256*)_ps256_tanh_lo, value); diff --git a/src/layer/x86/binaryop_x86.cpp b/src/layer/x86/binaryop_x86.cpp index 14ad9d5f638..2551f4c2438 100644 --- a/src/layer/x86/binaryop_x86.cpp +++ b/src/layer/x86/binaryop_x86.cpp @@ -479,22 +479,22 @@ namespace BinaryOp_x86_functor { struct binary_op_add { - float func(const float& x, const float& y) const + NCNN_FORCEINLINE float func(const float& x, const float& y) const { return x + y; } #if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const { return _mm_add_ps(x, y); } #if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const { return _mm256_add_ps(x, y); } #if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const { return _mm512_add_ps(x, y); } @@ -505,22 +505,22 @@ struct binary_op_add struct binary_op_sub { - float func(const float& x, const float& y) const + NCNN_FORCEINLINE float func(const float& x, const float& y) const { return x - y; } #if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const { return _mm_sub_ps(x, y); } #if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const { return _mm256_sub_ps(x, y); } #if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const { return _mm512_sub_ps(x, y); } @@ -531,22 +531,22 @@ struct binary_op_sub struct binary_op_mul { - float func(const float& x, const float& y) const + NCNN_FORCEINLINE float func(const float& x, const float& y) const { return x * y; } #if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const { return _mm_mul_ps(x, y); } #if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const { return _mm256_mul_ps(x, y); } #if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const { return _mm512_mul_ps(x, y); } @@ -557,22 +557,22 @@ struct binary_op_mul struct binary_op_div { - float func(const float& x, const float& y) const + NCNN_FORCEINLINE float func(const float& x, const float& y) const { return x / y; } #if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const { return _mm_div_ps(x, y); } #if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const { return _mm256_div_ps(x, y); } #if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const { return _mm512_div_ps(x, y); } @@ -583,22 +583,22 @@ struct binary_op_div struct binary_op_max { - float func(const float& x, const float& y) const + NCNN_FORCEINLINE float func(const float& x, const float& y) const { return std::max(x, y); } #if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const { return _mm_max_ps(x, y); } #if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const { return _mm256_max_ps(x, y); } #if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const { return _mm512_max_ps(x, y); } @@ -609,22 +609,22 @@ struct binary_op_max struct binary_op_min { - float func(const float& x, const float& y) const + NCNN_FORCEINLINE float func(const float& x, const float& y) const { return std::min(x, y); } #if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const { return _mm_min_ps(x, y); } #if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const { return _mm256_min_ps(x, y); } #if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const { return _mm512_min_ps(x, y); } @@ -635,22 +635,22 @@ struct binary_op_min struct binary_op_pow { - float func(const float& x, const float& y) const + NCNN_FORCEINLINE float func(const float& x, const float& y) const { return (float)powf(x, y); } #if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const { return pow_ps(x, y); } #if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const { return pow256_ps(x, y); } #if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const { return pow512_ps(x, y); } @@ -661,22 +661,22 @@ struct binary_op_pow struct binary_op_rsub { - float func(const float& x, const float& y) const + NCNN_FORCEINLINE float func(const float& x, const float& y) const { return y - x; } #if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const { return _mm_sub_ps(y, x); } #if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const { return _mm256_sub_ps(y, x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const { return _mm512_sub_ps(y, x); } @@ -687,22 +687,22 @@ struct binary_op_rsub struct binary_op_rdiv { - float func(const float& x, const float& y) const + NCNN_FORCEINLINE float func(const float& x, const float& y) const { return y / x; } #if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const { return _mm_div_ps(y, x); } #if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const { return _mm256_div_ps(y, x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const { return _mm512_div_ps(y, x); } @@ -713,22 +713,22 @@ struct binary_op_rdiv struct binary_op_rpow { - float func(const float& x, const float& y) const + NCNN_FORCEINLINE float func(const float& x, const float& y) const { return (float)powf(y, x); } #if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const { return pow_ps(y, x); } #if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const { return pow256_ps(y, x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const { return pow512_ps(y, x); } @@ -739,22 +739,22 @@ struct binary_op_rpow struct binary_op_atan2 { - float func(const float& x, const float& y) const + NCNN_FORCEINLINE float func(const float& x, const float& y) const { return (float)atan2f(x, y); } #if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const { return atan2_ps(x, y); } #if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const { return atan2256_ps(x, y); } #if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const { return atan2512_ps(x, y); } @@ -765,22 +765,22 @@ struct binary_op_atan2 struct binary_op_ratan2 { - float func(const float& x, const float& y) const + NCNN_FORCEINLINE float func(const float& x, const float& y) const { return (float)atan2f(y, x); } #if __SSE2__ - __m128 func_pack4(const __m128& x, const __m128& y) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const { return atan2_ps(y, x); } #if __AVX__ - __m256 func_pack8(const __m256& x, const __m256& y) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const { return atan2256_ps(y, x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x, const __m512& y) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const { return atan2512_ps(y, x); } diff --git a/src/layer/x86/padding_pack16.h b/src/layer/x86/padding_pack16.h index e5e4017af46..08d2d236d01 100644 --- a/src/layer/x86/padding_pack16.h +++ b/src/layer/x86/padding_pack16.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -static void padding_constant_pack16_avx512(const Mat& src, Mat& dst, int top, int bottom, int left, int right, __m512 v) +static void padding_constant_pack16_avx512(const Mat& src, Mat& dst, int top, int bottom, int left, int right, const __m512& v) { const float* ptr = src; float* outptr = dst; diff --git a/src/layer/x86/padding_pack4.h b/src/layer/x86/padding_pack4.h index 7f4cad8e07b..9eaaac79843 100644 --- a/src/layer/x86/padding_pack4.h +++ b/src/layer/x86/padding_pack4.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -static void padding_constant_pack4_sse(const Mat& src, Mat& dst, int top, int bottom, int left, int right, __m128 v) +static void padding_constant_pack4_sse(const Mat& src, Mat& dst, int top, int bottom, int left, int right, const __m128& v) { const float* ptr = src; float* outptr = dst; diff --git a/src/layer/x86/padding_pack8.h b/src/layer/x86/padding_pack8.h index 533f0e3ecf5..52a68726ad1 100644 --- a/src/layer/x86/padding_pack8.h +++ b/src/layer/x86/padding_pack8.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -static void padding_constant_pack8_avx(const Mat& src, Mat& dst, int top, int bottom, int left, int right, __m256 v) +static void padding_constant_pack8_avx(const Mat& src, Mat& dst, int top, int bottom, int left, int right, const __m256& v) { const float* ptr = src; float* outptr = dst; diff --git a/src/layer/x86/unaryop_x86.cpp b/src/layer/x86/unaryop_x86.cpp index 1ccd50d601a..e634328d4cb 100644 --- a/src/layer/x86/unaryop_x86.cpp +++ b/src/layer/x86/unaryop_x86.cpp @@ -101,22 +101,22 @@ static int unary_op_inplace(Mat& a, const Option& opt) namespace UnaryOp_x86_functor { struct unary_op_abs { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return (float)fabsf(x); } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return abs_ps(x); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return abs256_ps(x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return abs512_ps(x); } @@ -127,22 +127,22 @@ struct unary_op_abs struct unary_op_neg { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return -x; } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return _mm_sub_ps(_mm_setzero_ps(), x); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return _mm256_sub_ps(_mm256_setzero_ps(), x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return _mm512_sub_ps(_mm512_setzero_ps(), x); } @@ -153,22 +153,22 @@ struct unary_op_neg struct unary_op_floor { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return (float)floorf(x); } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return floor_ps(x); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return _mm256_floor_ps(x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return _mm512_roundscale_ps(x, _MM_FROUND_TO_NEG_INF); } @@ -179,22 +179,22 @@ struct unary_op_floor struct unary_op_ceil { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return (float)ceilf(x); } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return ceil_ps(x); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return _mm256_ceil_ps(x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return _mm512_roundscale_ps(x, _MM_FROUND_TO_POS_INF); } @@ -205,22 +205,22 @@ struct unary_op_ceil struct unary_op_square { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return x * x; } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return _mm_mul_ps(x, x); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return _mm256_mul_ps(x, x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return _mm512_mul_ps(x, x); } @@ -231,22 +231,22 @@ struct unary_op_square struct unary_op_sqrt { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return (float)sqrtf(x); } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return _mm_sqrt_ps(x); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return _mm256_sqrt_ps(x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return _mm512_sqrt_ps(x); } @@ -257,22 +257,22 @@ struct unary_op_sqrt struct unary_op_rsqrt { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return 1.f / sqrtf(x); } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return _mm_rsqrt_ps(x); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return _mm256_rsqrt_ps(x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { __m256 _x0 = _mm512_extractf32x8_ps(x, 0); __m256 _x1 = _mm512_extractf32x8_ps(x, 1); @@ -287,22 +287,22 @@ struct unary_op_rsqrt struct unary_op_exp { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return (float)expf(x); } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return exp_ps(x); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return exp256_ps(x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return exp512_ps(x); } @@ -313,22 +313,22 @@ struct unary_op_exp struct unary_op_log { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return (float)logf(x); } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return log_ps(x); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return log256_ps(x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return log512_ps(x); } @@ -339,22 +339,22 @@ struct unary_op_log struct unary_op_sin { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return (float)sinf(x); } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return sin_ps(x); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return sin256_ps(x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return sin512_ps(x); } @@ -365,22 +365,22 @@ struct unary_op_sin struct unary_op_cos { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return (float)cosf(x); } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return cos_ps(x); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return cos256_ps(x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return cos512_ps(x); } @@ -391,22 +391,22 @@ struct unary_op_cos struct unary_op_tan { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return (float)tanf(x); } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return tan_ps(x); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return tan256_ps(x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return tan512_ps(x); } @@ -417,22 +417,22 @@ struct unary_op_tan struct unary_op_asin { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return (float)asinf(x); } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return asin_ps(x); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return asin256_ps(x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return asin512_ps(x); } @@ -443,22 +443,22 @@ struct unary_op_asin struct unary_op_acos { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return (float)acosf(x); } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return acos_ps(x); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return acos256_ps(x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return acos512_ps(x); } @@ -469,22 +469,22 @@ struct unary_op_acos struct unary_op_atan { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return (float)atanf(x); } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return atan_ps(x); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return atan256_ps(x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return atan512_ps(x); } @@ -495,22 +495,22 @@ struct unary_op_atan struct unary_op_reciprocal { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return 1.f / x; } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return _mm_div_ps(*(__m128*)_ps_1, x); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return _mm256_div_ps(*(__m256*)_ps256_1, x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return _mm512_div_ps(*(__m512*)_ps512_1, x); } @@ -521,22 +521,22 @@ struct unary_op_reciprocal struct unary_op_tanh { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return (float)tanhf(x); } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return tanh_sse(x); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return tanh_avx(x); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return tanh_avx512(x); } @@ -547,22 +547,22 @@ struct unary_op_tanh struct unary_op_log10 { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return (float)log10f(x); } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { return _mm_mul_ps(log_ps(x), _mm_set1_ps(0.434294481903)); } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return _mm256_mul_ps(log256_ps(x), _mm256_set1_ps(0.434294481903)); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return _mm512_mul_ps(log512_ps(x), _mm512_set1_ps(0.434294481903)); } @@ -573,7 +573,7 @@ struct unary_op_log10 struct unary_op_round { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { // round to nearest even // return (x + 12582912.f) - 12582912.f; @@ -588,7 +588,7 @@ struct unary_op_round return y; } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { #if __SSE4_1__ return _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); @@ -597,12 +597,12 @@ struct unary_op_round #endif } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return _mm256_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return _mm512_roundscale_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } @@ -613,12 +613,12 @@ struct unary_op_round struct unary_op_trunc { - float func(const float& x) const + NCNN_FORCEINLINE float func(const float& x) const { return (float)truncf(x); } #if __SSE2__ - __m128 func_pack4(const __m128& x) const + NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const { #if __SSE4_1__ return _mm_round_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); @@ -627,12 +627,12 @@ struct unary_op_trunc #endif } #if __AVX__ - __m256 func_pack8(const __m256& x) const + NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const { return _mm256_round_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); } #if __AVX512F__ - __m512 func_pack16(const __m512& x) const + NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const { return _mm512_roundscale_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); }