diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 5c83d56e216..35e46535a3b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -330,7 +330,7 @@ if(NCNN_VULKAN)
     target_link_libraries(ncnn PRIVATE glslang SPIRV)
 endif()
 
-if(NCNN_PLATFORM_API AND ANDROID_NDK)
+if(NCNN_PLATFORM_API AND ANDROID)
     target_link_libraries(ncnn PUBLIC android jnigraphics log)
 endif()
 
diff --git a/src/layer/x86/avx_mathfun.h b/src/layer/x86/avx_mathfun.h
index 65c34efc23e..458bacbc971 100644
--- a/src/layer/x86/avx_mathfun.h
+++ b/src/layer/x86/avx_mathfun.h
@@ -311,7 +311,7 @@ _PS256_CONST(cephes_tanh_p8, 1.18534705686654e-04f);
 _PS256_CONST(cephes_tanh_p9, 2.26843463243900e-03f);
 
 // an approximation of tanh
-static inline __m256 tanh256_ps(const __m256 x)
+static NCNN_FORCEINLINE __m256 tanh256_ps(__m256 x)
 {
     __m256 value = x;
     value = _mm256_max_ps(*(__m256*)_ps256_tanh_lo, value);
diff --git a/src/layer/x86/binaryop_x86.cpp b/src/layer/x86/binaryop_x86.cpp
index 14ad9d5f638..2551f4c2438 100644
--- a/src/layer/x86/binaryop_x86.cpp
+++ b/src/layer/x86/binaryop_x86.cpp
@@ -479,22 +479,22 @@ namespace BinaryOp_x86_functor {
 
 struct binary_op_add
 {
-    float func(const float& x, const float& y) const
+    NCNN_FORCEINLINE float func(const float& x, const float& y) const
     {
         return x + y;
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x, const __m128& y) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const
     {
         return _mm_add_ps(x, y);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x, const __m256& y) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const
     {
         return _mm256_add_ps(x, y);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x, const __m512& y) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const
     {
         return _mm512_add_ps(x, y);
     }
@@ -505,22 +505,22 @@ struct binary_op_add
 
 struct binary_op_sub
 {
-    float func(const float& x, const float& y) const
+    NCNN_FORCEINLINE float func(const float& x, const float& y) const
     {
         return x - y;
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x, const __m128& y) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const
     {
         return _mm_sub_ps(x, y);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x, const __m256& y) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const
     {
         return _mm256_sub_ps(x, y);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x, const __m512& y) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const
     {
         return _mm512_sub_ps(x, y);
     }
@@ -531,22 +531,22 @@ struct binary_op_sub
 
 struct binary_op_mul
 {
-    float func(const float& x, const float& y) const
+    NCNN_FORCEINLINE float func(const float& x, const float& y) const
     {
         return x * y;
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x, const __m128& y) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const
     {
         return _mm_mul_ps(x, y);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x, const __m256& y) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const
     {
         return _mm256_mul_ps(x, y);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x, const __m512& y) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const
     {
         return _mm512_mul_ps(x, y);
     }
@@ -557,22 +557,22 @@ struct binary_op_mul
 
 struct binary_op_div
 {
-    float func(const float& x, const float& y) const
+    NCNN_FORCEINLINE float func(const float& x, const float& y) const
     {
         return x / y;
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x, const __m128& y) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const
     {
         return _mm_div_ps(x, y);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x, const __m256& y) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const
     {
         return _mm256_div_ps(x, y);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x, const __m512& y) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const
     {
         return _mm512_div_ps(x, y);
     }
@@ -583,22 +583,22 @@ struct binary_op_div
 
 struct binary_op_max
 {
-    float func(const float& x, const float& y) const
+    NCNN_FORCEINLINE float func(const float& x, const float& y) const
     {
         return std::max(x, y);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x, const __m128& y) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const
     {
         return _mm_max_ps(x, y);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x, const __m256& y) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const
     {
         return _mm256_max_ps(x, y);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x, const __m512& y) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const
     {
         return _mm512_max_ps(x, y);
     }
@@ -609,22 +609,22 @@ struct binary_op_max
 
 struct binary_op_min
 {
-    float func(const float& x, const float& y) const
+    NCNN_FORCEINLINE float func(const float& x, const float& y) const
     {
         return std::min(x, y);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x, const __m128& y) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const
     {
         return _mm_min_ps(x, y);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x, const __m256& y) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const
     {
         return _mm256_min_ps(x, y);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x, const __m512& y) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const
     {
         return _mm512_min_ps(x, y);
     }
@@ -635,22 +635,22 @@ struct binary_op_min
 
 struct binary_op_pow
 {
-    float func(const float& x, const float& y) const
+    NCNN_FORCEINLINE float func(const float& x, const float& y) const
     {
         return (float)powf(x, y);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x, const __m128& y) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const
     {
         return pow_ps(x, y);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x, const __m256& y) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const
     {
         return pow256_ps(x, y);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x, const __m512& y) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const
     {
         return pow512_ps(x, y);
     }
@@ -661,22 +661,22 @@ struct binary_op_pow
 
 struct binary_op_rsub
 {
-    float func(const float& x, const float& y) const
+    NCNN_FORCEINLINE float func(const float& x, const float& y) const
     {
         return y - x;
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x, const __m128& y) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const
     {
         return _mm_sub_ps(y, x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x, const __m256& y) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const
     {
         return _mm256_sub_ps(y, x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x, const __m512& y) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const
     {
         return _mm512_sub_ps(y, x);
     }
@@ -687,22 +687,22 @@ struct binary_op_rsub
 
 struct binary_op_rdiv
 {
-    float func(const float& x, const float& y) const
+    NCNN_FORCEINLINE float func(const float& x, const float& y) const
     {
         return y / x;
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x, const __m128& y) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const
     {
         return _mm_div_ps(y, x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x, const __m256& y) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const
     {
         return _mm256_div_ps(y, x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x, const __m512& y) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const
     {
         return _mm512_div_ps(y, x);
     }
@@ -713,22 +713,22 @@ struct binary_op_rdiv
 
 struct binary_op_rpow
 {
-    float func(const float& x, const float& y) const
+    NCNN_FORCEINLINE float func(const float& x, const float& y) const
     {
         return (float)powf(y, x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x, const __m128& y) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const
     {
         return pow_ps(y, x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x, const __m256& y) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const
     {
         return pow256_ps(y, x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x, const __m512& y) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const
     {
         return pow512_ps(y, x);
     }
@@ -739,22 +739,22 @@ struct binary_op_rpow
 
 struct binary_op_atan2
 {
-    float func(const float& x, const float& y) const
+    NCNN_FORCEINLINE float func(const float& x, const float& y) const
     {
         return (float)atan2f(x, y);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x, const __m128& y) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const
     {
         return atan2_ps(x, y);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x, const __m256& y) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const
     {
         return atan2256_ps(x, y);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x, const __m512& y) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const
     {
         return atan2512_ps(x, y);
     }
@@ -765,22 +765,22 @@ struct binary_op_atan2
 
 struct binary_op_ratan2
 {
-    float func(const float& x, const float& y) const
+    NCNN_FORCEINLINE float func(const float& x, const float& y) const
     {
         return (float)atan2f(y, x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x, const __m128& y) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x, const __m128& y) const
     {
         return atan2_ps(y, x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x, const __m256& y) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x, const __m256& y) const
     {
         return atan2256_ps(y, x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x, const __m512& y) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x, const __m512& y) const
     {
         return atan2512_ps(y, x);
     }
diff --git a/src/layer/x86/padding_pack16.h b/src/layer/x86/padding_pack16.h
index e5e4017af46..08d2d236d01 100644
--- a/src/layer/x86/padding_pack16.h
+++ b/src/layer/x86/padding_pack16.h
@@ -12,7 +12,7 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-static void padding_constant_pack16_avx512(const Mat& src, Mat& dst, int top, int bottom, int left, int right, __m512 v)
+static void padding_constant_pack16_avx512(const Mat& src, Mat& dst, int top, int bottom, int left, int right, const __m512& v)
 {
     const float* ptr = src;
     float* outptr = dst;
diff --git a/src/layer/x86/padding_pack4.h b/src/layer/x86/padding_pack4.h
index 7f4cad8e07b..9eaaac79843 100644
--- a/src/layer/x86/padding_pack4.h
+++ b/src/layer/x86/padding_pack4.h
@@ -12,7 +12,7 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-static void padding_constant_pack4_sse(const Mat& src, Mat& dst, int top, int bottom, int left, int right, __m128 v)
+static void padding_constant_pack4_sse(const Mat& src, Mat& dst, int top, int bottom, int left, int right, const __m128& v)
 {
     const float* ptr = src;
     float* outptr = dst;
diff --git a/src/layer/x86/padding_pack8.h b/src/layer/x86/padding_pack8.h
index 533f0e3ecf5..52a68726ad1 100644
--- a/src/layer/x86/padding_pack8.h
+++ b/src/layer/x86/padding_pack8.h
@@ -12,7 +12,7 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-static void padding_constant_pack8_avx(const Mat& src, Mat& dst, int top, int bottom, int left, int right, __m256 v)
+static void padding_constant_pack8_avx(const Mat& src, Mat& dst, int top, int bottom, int left, int right, const __m256& v)
 {
     const float* ptr = src;
     float* outptr = dst;
diff --git a/src/layer/x86/unaryop_x86.cpp b/src/layer/x86/unaryop_x86.cpp
index 1ccd50d601a..e634328d4cb 100644
--- a/src/layer/x86/unaryop_x86.cpp
+++ b/src/layer/x86/unaryop_x86.cpp
@@ -101,22 +101,22 @@ static int unary_op_inplace(Mat& a, const Option& opt)
 namespace UnaryOp_x86_functor {
 struct unary_op_abs
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return (float)fabsf(x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return abs_ps(x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return abs256_ps(x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return abs512_ps(x);
     }
@@ -127,22 +127,22 @@ struct unary_op_abs
 
 struct unary_op_neg
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return -x;
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return _mm_sub_ps(_mm_setzero_ps(), x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return _mm256_sub_ps(_mm256_setzero_ps(), x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return _mm512_sub_ps(_mm512_setzero_ps(), x);
     }
@@ -153,22 +153,22 @@ struct unary_op_neg
 
 struct unary_op_floor
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return (float)floorf(x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return floor_ps(x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return _mm256_floor_ps(x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return _mm512_roundscale_ps(x, _MM_FROUND_TO_NEG_INF);
     }
@@ -179,22 +179,22 @@ struct unary_op_floor
 
 struct unary_op_ceil
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return (float)ceilf(x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return ceil_ps(x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return _mm256_ceil_ps(x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return _mm512_roundscale_ps(x, _MM_FROUND_TO_POS_INF);
     }
@@ -205,22 +205,22 @@ struct unary_op_ceil
 
 struct unary_op_square
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return x * x;
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return _mm_mul_ps(x, x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return _mm256_mul_ps(x, x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return _mm512_mul_ps(x, x);
     }
@@ -231,22 +231,22 @@ struct unary_op_square
 
 struct unary_op_sqrt
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return (float)sqrtf(x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return _mm_sqrt_ps(x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return _mm256_sqrt_ps(x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return _mm512_sqrt_ps(x);
     }
@@ -257,22 +257,22 @@ struct unary_op_sqrt
 
 struct unary_op_rsqrt
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return 1.f / sqrtf(x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return _mm_rsqrt_ps(x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return _mm256_rsqrt_ps(x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         __m256 _x0 = _mm512_extractf32x8_ps(x, 0);
         __m256 _x1 = _mm512_extractf32x8_ps(x, 1);
@@ -287,22 +287,22 @@ struct unary_op_rsqrt
 
 struct unary_op_exp
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return (float)expf(x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return exp_ps(x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return exp256_ps(x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return exp512_ps(x);
     }
@@ -313,22 +313,22 @@ struct unary_op_exp
 
 struct unary_op_log
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return (float)logf(x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return log_ps(x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return log256_ps(x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return log512_ps(x);
     }
@@ -339,22 +339,22 @@ struct unary_op_log
 
 struct unary_op_sin
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return (float)sinf(x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return sin_ps(x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return sin256_ps(x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return sin512_ps(x);
     }
@@ -365,22 +365,22 @@ struct unary_op_sin
 
 struct unary_op_cos
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return (float)cosf(x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return cos_ps(x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return cos256_ps(x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return cos512_ps(x);
     }
@@ -391,22 +391,22 @@ struct unary_op_cos
 
 struct unary_op_tan
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return (float)tanf(x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return tan_ps(x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return tan256_ps(x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return tan512_ps(x);
     }
@@ -417,22 +417,22 @@ struct unary_op_tan
 
 struct unary_op_asin
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return (float)asinf(x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return asin_ps(x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return asin256_ps(x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return asin512_ps(x);
     }
@@ -443,22 +443,22 @@ struct unary_op_asin
 
 struct unary_op_acos
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return (float)acosf(x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return acos_ps(x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return acos256_ps(x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return acos512_ps(x);
     }
@@ -469,22 +469,22 @@ struct unary_op_acos
 
 struct unary_op_atan
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return (float)atanf(x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return atan_ps(x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return atan256_ps(x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return atan512_ps(x);
     }
@@ -495,22 +495,22 @@ struct unary_op_atan
 
 struct unary_op_reciprocal
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return 1.f / x;
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return _mm_div_ps(*(__m128*)_ps_1, x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return _mm256_div_ps(*(__m256*)_ps256_1, x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return _mm512_div_ps(*(__m512*)_ps512_1, x);
     }
@@ -521,22 +521,22 @@ struct unary_op_reciprocal
 
 struct unary_op_tanh
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return (float)tanhf(x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return tanh_sse(x);
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return tanh_avx(x);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return tanh_avx512(x);
     }
@@ -547,22 +547,22 @@ struct unary_op_tanh
 
 struct unary_op_log10
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return (float)log10f(x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
         return _mm_mul_ps(log_ps(x), _mm_set1_ps(0.434294481903));
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return _mm256_mul_ps(log256_ps(x), _mm256_set1_ps(0.434294481903));
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return _mm512_mul_ps(log512_ps(x), _mm512_set1_ps(0.434294481903));
     }
@@ -573,7 +573,7 @@ struct unary_op_log10
 
 struct unary_op_round
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         // round to nearest even
         // return (x + 12582912.f) - 12582912.f;
@@ -588,7 +588,7 @@ struct unary_op_round
         return y;
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
 #if __SSE4_1__
         return _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
@@ -597,12 +597,12 @@ struct unary_op_round
 #endif
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return _mm256_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return _mm512_roundscale_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
     }
@@ -613,12 +613,12 @@ struct unary_op_round
 
 struct unary_op_trunc
 {
-    float func(const float& x) const
+    NCNN_FORCEINLINE float func(const float& x) const
     {
         return (float)truncf(x);
     }
 #if __SSE2__
-    __m128 func_pack4(const __m128& x) const
+    NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const
     {
 #if __SSE4_1__
         return _mm_round_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
@@ -627,12 +627,12 @@ struct unary_op_trunc
 #endif
     }
 #if __AVX__
-    __m256 func_pack8(const __m256& x) const
+    NCNN_FORCEINLINE __m256 func_pack8(const __m256& x) const
     {
         return _mm256_round_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
     }
 #if __AVX512F__
-    __m512 func_pack16(const __m512& x) const
+    NCNN_FORCEINLINE __m512 func_pack16(const __m512& x) const
     {
         return _mm512_roundscale_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
     }