From 44b9c428430ae8a6cfeaf84a823c22d18617ccf3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 17 Mar 2024 11:45:30 -0700 Subject: [PATCH 01/10] checksum_benchmarks.sh: handle adler32_arm_neon_dotprod() --- scripts/checksum_benchmarks.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/checksum_benchmarks.sh b/scripts/checksum_benchmarks.sh index 94f281aa..f39dc2da 100755 --- a/scripts/checksum_benchmarks.sh +++ b/scripts/checksum_benchmarks.sh @@ -226,9 +226,15 @@ arm*) fi ;; aarch*) + EXTRA_CFLAGS=("-march=armv8-a") + if have_cpu_features asimd asimddp; then + do_benchmark "DOTPROD" + disable_cpu_feature dotprod + fi if have_cpu_features asimd; then do_benchmark "NEON" - disable_cpu_feature neon "-march=armv8-a+nosimd" + disable_cpu_feature neon + EXTRA_CFLAGS=("-march=armv8-a+nosimd") fi ;; esac From c1926a4d6a715b2fa502311ef8b7f2321d096aa5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 17 Mar 2024 11:45:30 -0700 Subject: [PATCH 02/10] lib/arm: move selection of pmull_wide into arm_cpu_features Handle the selection of crc32_arm_pmullx12_crc using a CPU feature flag, similar to X86_CPU_FEATURE_ZMM. This allows the code to be tested on platforms other than macOS. --- lib/arm/cpu_features.c | 15 ++++++++++++ lib/arm/cpu_features.h | 12 ++++++--- lib/arm/crc32_impl.h | 45 +++++++--------------------------- lib/arm/crc32_pmull_wide.h | 2 +- scripts/checksum_benchmarks.sh | 20 +++++++++++---- scripts/run_tests.sh | 2 +- 6 files changed, 50 insertions(+), 46 deletions(-) diff --git a/lib/arm/cpu_features.c b/lib/arm/cpu_features.c index 72ab03da..2cd44452 100644 --- a/lib/arm/cpu_features.c +++ b/lib/arm/cpu_features.c @@ -138,6 +138,7 @@ static u32 query_arm_cpu_features(void) #include #include +#include static const struct { const char *name; @@ -192,6 +193,7 @@ static u32 query_arm_cpu_features(void) static const struct cpu_feature arm_cpu_feature_table[] = { {ARM_CPU_FEATURE_NEON, "neon"}, {ARM_CPU_FEATURE_PMULL, "pmull"}, + {ARM_CPU_FEATURE_PREFER_PMULL, "prefer_pmull"}, {ARM_CPU_FEATURE_CRC32, "crc32"}, {ARM_CPU_FEATURE_SHA3, "sha3"}, {ARM_CPU_FEATURE_DOTPROD, "dotprod"}, @@ -203,6 +205,19 @@ void libdeflate_init_arm_cpu_features(void) { u32 features = query_arm_cpu_features(); + /* + * On the Apple M1 processor, crc32 instructions max out at about 25.5 + * GB/s in the best case of using a 3-way or greater interleaved chunked + * implementation, whereas a pmull-based implementation achieves 68 GB/s + * provided that the stride length is large enough (about 10+ vectors + * with eor3, or 12+ without). + * + * Assume that crc32 instructions are preferable in other cases. + */ +#if (defined(__APPLE__) && TARGET_OS_OSX) || defined(TEST_SUPPORT__DO_NOT_USE) + features |= ARM_CPU_FEATURE_PREFER_PMULL; +#endif + disable_cpu_features_for_testing(&features, arm_cpu_feature_table, ARRAY_LEN(arm_cpu_feature_table)); diff --git a/lib/arm/cpu_features.h b/lib/arm/cpu_features.h index 30920c6b..b1ef6a85 100644 --- a/lib/arm/cpu_features.h +++ b/lib/arm/cpu_features.h @@ -45,9 +45,15 @@ #define ARM_CPU_FEATURE_NEON (1 << 0) #define ARM_CPU_FEATURE_PMULL (1 << 1) -#define ARM_CPU_FEATURE_CRC32 (1 << 2) -#define ARM_CPU_FEATURE_SHA3 (1 << 3) -#define ARM_CPU_FEATURE_DOTPROD (1 << 4) +/* + * PREFER_PMULL indicates that the CPU has very high pmull throughput, and so + * the 12x wide pmull-based CRC-32 implementation is likely to be faster than an + * implementation based on the crc32 instructions. + */ +#define ARM_CPU_FEATURE_PREFER_PMULL (1 << 2) +#define ARM_CPU_FEATURE_CRC32 (1 << 3) +#define ARM_CPU_FEATURE_SHA3 (1 << 4) +#define ARM_CPU_FEATURE_DOTPROD (1 << 5) #define HAVE_NEON(features) (HAVE_NEON_NATIVE || ((features) & ARM_CPU_FEATURE_NEON)) #define HAVE_PMULL(features) (HAVE_PMULL_NATIVE || ((features) & ARM_CPU_FEATURE_PMULL)) diff --git a/lib/arm/crc32_impl.h b/lib/arm/crc32_impl.h index d52954a6..47bce01f 100644 --- a/lib/arm/crc32_impl.h +++ b/lib/arm/crc32_impl.h @@ -113,7 +113,7 @@ combine_crcs_slow(u32 crc0, u32 crc1, u32 crc2, u32 crc3) } #define crc32_arm_crc crc32_arm_crc -static ATTRIBUTES MAYBE_UNUSED u32 +static ATTRIBUTES u32 crc32_arm_crc(u32 crc, const u8 *p, size_t len) { if (len >= 64) { @@ -289,7 +289,7 @@ combine_crcs_fast(u32 crc0, u32 crc1, u32 crc2, u32 crc3, size_t i) } #define crc32_arm_crc_pmullcombine crc32_arm_crc_pmullcombine -static ATTRIBUTES MAYBE_UNUSED u32 +static ATTRIBUTES u32 crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len) { const size_t align = -(uintptr_t)p & 7; @@ -470,7 +470,7 @@ crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len) # define ENABLE_EOR3 0 # include "crc32_pmull_helpers.h" -static ATTRIBUTES MAYBE_UNUSED u32 +static ATTRIBUTES u32 crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) { static const u64 _aligned_attribute(16) mults[3][2] = { @@ -621,45 +621,19 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) # include "crc32_pmull_wide.h" #endif -/* - * On the Apple M1 processor, crc32 instructions max out at about 25.5 GB/s in - * the best case of using a 3-way or greater interleaved chunked implementation, - * whereas a pmull-based implementation achieves 68 GB/s provided that the - * stride length is large enough (about 10+ vectors with eor3, or 12+ without). - * - * For now we assume that crc32 instructions are preferable in other cases. - */ -#define PREFER_PMULL_TO_CRC 0 -#ifdef __APPLE__ -# include -# if TARGET_OS_OSX -# undef PREFER_PMULL_TO_CRC -# define PREFER_PMULL_TO_CRC 1 -# endif -#endif - -/* - * If the best implementation is statically available, use it unconditionally. - * Otherwise choose the best implementation at runtime. - */ -#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc_eor3) && \ - HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE -# define DEFAULT_IMPL crc32_arm_pmullx12_crc_eor3 -#elif !PREFER_PMULL_TO_CRC && defined(crc32_arm_crc_pmullcombine) && \ - HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE -# define DEFAULT_IMPL crc32_arm_crc_pmullcombine -#else static inline crc32_func_t arch_select_crc32_func(void) { const u32 features MAYBE_UNUSED = get_arm_cpu_features(); -#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc_eor3) - if (HAVE_PMULL(features) && HAVE_CRC32(features) && HAVE_SHA3(features)) +#ifdef crc32_arm_pmullx12_crc_eor3 + if ((features & ARM_CPU_FEATURE_PREFER_PMULL) && + HAVE_PMULL(features) && HAVE_CRC32(features) && HAVE_SHA3(features)) return crc32_arm_pmullx12_crc_eor3; #endif -#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc) - if (HAVE_PMULL(features) && HAVE_CRC32(features)) +#ifdef crc32_arm_pmullx12_crc + if ((features & ARM_CPU_FEATURE_PREFER_PMULL) && + HAVE_PMULL(features) && HAVE_CRC32(features)) return crc32_arm_pmullx12_crc; #endif #ifdef crc32_arm_crc_pmullcombine @@ -677,6 +651,5 @@ arch_select_crc32_func(void) return NULL; } #define arch_select_crc32_func arch_select_crc32_func -#endif #endif /* LIB_ARM_CRC32_IMPL_H */ diff --git a/lib/arm/crc32_pmull_wide.h b/lib/arm/crc32_pmull_wide.h index 5a4bd0ca..076f0479 100644 --- a/lib/arm/crc32_pmull_wide.h +++ b/lib/arm/crc32_pmull_wide.h @@ -52,7 +52,7 @@ #include "crc32_pmull_helpers.h" -static ATTRIBUTES MAYBE_UNUSED u32 +static ATTRIBUTES u32 ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len) { uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11; diff --git a/scripts/checksum_benchmarks.sh b/scripts/checksum_benchmarks.sh index f39dc2da..54664709 100755 --- a/scripts/checksum_benchmarks.sh +++ b/scripts/checksum_benchmarks.sh @@ -173,14 +173,24 @@ i386|x86_64) disable_cpu_feature pclmulqdq "-mno-pclmul" fi ;; -arm*|aarch*) +aarch*) + EXTRA_CFLAGS=("-march=armv8-a") + if have_cpu_features pmull crc32 sha3; then + do_benchmark "pmullx12_crc_eor3" + disable_cpu_feature sha3 + fi + if have_cpu_features pmull crc32; then + do_benchmark "pmullx12_crc" + disable_cpu_feature prefer_pmull + do_benchmark "crc_pmullcombine" + fi if have_cpu_features crc32; then - do_benchmark "ARM" - disable_cpu_feature crc32 "-march=armv8-a+nocrc" + do_benchmark "crc" + disable_cpu_feature crc32 fi if have_cpu_features pmull; then - do_benchmark "PMULL" - disable_cpu_feature pmull "-march=armv8-a+nocrc+nocrypto" + do_benchmark "pmull4x" + disable_cpu_feature pmull fi ;; esac diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index 4419bc06..0bf5b62a 100755 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -146,7 +146,7 @@ build_and_run_tests() avx2 avx bmi2 pclmulqdq sse2) ;; arm*|aarch*) - features+=(dotprod sha3 crc32 pmull neon) + features+=(dotprod sha3 prefer_pmull crc32 pmull neon) ;; esac fi From 7129f4b97571f4e356a098fd05f8dbcffc2e89e0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 17 Mar 2024 11:45:30 -0700 Subject: [PATCH 03/10] lib/arm: drop the arm32 support for pmull and crc32 instructions Drop support for the pmull and crc32 optimized CRC-32 functions when building for 32-bit ARM. Not many people care about 32-bit ARM these days, and these optimizations were always a struggle to keep working on 32-bit due to compiler issues. They also only ever applied to processors that support 64-bit too. --- lib/arm/cpu_features.c | 4 -- lib/arm/cpu_features.h | 60 +++++------------------------- lib/arm/crc32_impl.h | 69 ++++++++++------------------------- lib/arm/crc32_pmull_helpers.h | 28 +++----------- 4 files changed, 34 insertions(+), 127 deletions(-) diff --git a/lib/arm/cpu_features.c b/lib/arm/cpu_features.c index 2cd44452..3c1b85cb 100644 --- a/lib/arm/cpu_features.c +++ b/lib/arm/cpu_features.c @@ -113,10 +113,6 @@ static u32 query_arm_cpu_features(void) STATIC_ASSERT(sizeof(long) == 4); if (hwcap & (1 << 12)) /* HWCAP_NEON */ features |= ARM_CPU_FEATURE_NEON; - if (hwcap2 & (1 << 1)) /* HWCAP2_PMULL */ - features |= ARM_CPU_FEATURE_PMULL; - if (hwcap2 & (1 << 4)) /* HWCAP2_CRC32 */ - features |= ARM_CPU_FEATURE_CRC32; #else STATIC_ASSERT(sizeof(long) == 8); if (hwcap & (1 << 1)) /* HWCAP_ASIMD */ diff --git a/lib/arm/cpu_features.h b/lib/arm/cpu_features.h index b1ef6a85..9f4b4e77 100644 --- a/lib/arm/cpu_features.h +++ b/lib/arm/cpu_features.h @@ -101,17 +101,10 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #else # define HAVE_PMULL_NATIVE 0 #endif -#if HAVE_PMULL_NATIVE || \ - (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ - HAVE_NEON_INTRIN /* needed to exclude soft float arm32 case */ && \ - (GCC_PREREQ(6, 1) || defined(__clang__) || defined(_MSC_VER)) && \ - /* - * On arm32 with clang, the crypto intrinsics (which include pmull) - * are not defined, even when using -mfpu=crypto-neon-fp-armv8, - * because clang's puts their definitions behind - * __aarch64__. - */ \ - !(defined(ARCH_ARM32) && defined(__clang__))) +#if defined(ARCH_ARM64) && \ + (HAVE_PMULL_NATIVE || \ + (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ + (GCC_PREREQ(6, 1) || defined(__clang__) || defined(_MSC_VER)))) # define HAVE_PMULL_INTRIN CPU_IS_LITTLE_ENDIAN() /* untested on big endian */ /* Work around MSVC's vmull_p64() taking poly64x1_t instead of poly64_t */ # ifdef _MSC_VER @@ -152,43 +145,10 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #else # define HAVE_CRC32_NATIVE 0 #endif -#undef HAVE_CRC32_INTRIN -#if HAVE_CRC32_NATIVE +#if defined(ARCH_ARM64) && (HAVE_CRC32_NATIVE || defined(__GNUC__) || \ + defined(__clang__) || defined(_MSC_VER)) # define HAVE_CRC32_INTRIN 1 -#elif HAVE_DYNAMIC_ARM_CPU_FEATURES -# if GCC_PREREQ(1, 0) - /* - * Support for ARM CRC32 intrinsics when CRC32 instructions are not enabled - * in the main target has been affected by two gcc bugs, which we must avoid - * by only allowing gcc versions that have the corresponding fixes. First, - * gcc commit 943766d37ae4 ("[arm] Fix use of CRC32 intrinsics with Armv8-a - * and hard-float"), i.e. gcc 8.4+, 9.3+, 10.1+, or 11+, is needed. Second, - * gcc commit c1cdabe3aab8 ("arm: reorder assembler architecture directives - * [PR101723]"), i.e. gcc 9.5+, 10.4+, 11.3+, or 12+, is needed when - * binutils is 2.34 or later, due to - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104439. We use the second - * set of prerequisites, as they are stricter and we have no way to detect - * the binutils version directly from a C source file. - * - * Also exclude the cases where the main target arch is armv6kz or armv7e-m. - * In those cases, gcc doesn't let functions that use the main arch be - * inlined into functions that are targeted to armv8-a+crc. (armv8-a is - * necessary for crc to be accepted at all.) That causes build errors. - * This issue happens for these specific sub-archs because they are not a - * subset of armv8-a. Note: clang does not have this limitation. - */ -# if (GCC_PREREQ(11, 3) || \ - (GCC_PREREQ(10, 4) && !GCC_PREREQ(11, 0)) || \ - (GCC_PREREQ(9, 5) && !GCC_PREREQ(10, 0))) && \ - !defined(__ARM_ARCH_6KZ__) && \ - !defined(__ARM_ARCH_7EM__) -# define HAVE_CRC32_INTRIN 1 -# endif -# elif defined(__clang__) || defined(_MSC_VER) -# define HAVE_CRC32_INTRIN 1 -# endif -#endif -#ifndef HAVE_CRC32_INTRIN +#else # define HAVE_CRC32_INTRIN 0 #endif @@ -239,8 +199,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; } * defined, though, so work around this by temporarily defining the * corresponding __ARM_FEATURE_* macros while including the headers. */ -#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \ - (defined(__clang__) || defined(ARCH_ARM32)) +#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && defined(__clang__) # define __ARM_FEATURE_CRC32 1 #endif #if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__) @@ -249,8 +208,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__) # define __ARM_FEATURE_DOTPROD 1 #endif -#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \ - (defined(__clang__) || defined(ARCH_ARM32)) +#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && defined(__clang__) # include # undef __ARM_FEATURE_CRC32 #endif diff --git a/lib/arm/crc32_impl.h b/lib/arm/crc32_impl.h index 47bce01f..b23c9353 100644 --- a/lib/arm/crc32_impl.h +++ b/lib/arm/crc32_impl.h @@ -47,27 +47,10 @@ #if HAVE_CRC32_INTRIN # if HAVE_CRC32_NATIVE # define ATTRIBUTES +# elif defined(__clang__) +# define ATTRIBUTES _target_attribute("crc") # else -# ifdef ARCH_ARM32 -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("armv8-a,crc") -# elif defined(__ARM_PCS_VFP) - /* - * +simd is needed to avoid a "selected architecture lacks an FPU" - * error with Debian arm-linux-gnueabihf-gcc when -mfpu is not - * explicitly specified on the command line. - */ -# define ATTRIBUTES _target_attribute("arch=armv8-a+crc+simd") -# else -# define ATTRIBUTES _target_attribute("arch=armv8-a+crc") -# endif -# else -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("crc") -# else -# define ATTRIBUTES _target_attribute("+crc") -# endif -# endif +# define ATTRIBUTES _target_attribute("+crc") # endif #ifndef _MSC_VER @@ -245,16 +228,10 @@ crc32_arm_crc(u32 crc, const u8 *p, size_t len) #if HAVE_CRC32_INTRIN && HAVE_PMULL_INTRIN # if HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE # define ATTRIBUTES +# elif defined(__clang__) +# define ATTRIBUTES _target_attribute("crc,aes") # else -# ifdef ARCH_ARM32 -# define ATTRIBUTES _target_attribute("arch=armv8-a+crc,fpu=crypto-neon-fp-armv8") -# else -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("crc,aes") -# else -# define ATTRIBUTES _target_attribute("+crc,+crypto") -# endif -# endif +# define ATTRIBUTES _target_attribute("+crc,+crypto") # endif #ifndef _MSC_VER @@ -447,25 +424,19 @@ crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len) # define SUFFIX _pmullx4 # if HAVE_PMULL_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE # define ATTRIBUTES +# elif defined(__clang__) + /* + * This used to use "crypto", but that stopped working with clang 16. + * Now only "aes" works. "aes" works with older versions too, so use + * that. No "+" prefix; clang 15 and earlier doesn't accept that. + */ +# define ATTRIBUTES _target_attribute("aes") # else -# ifdef ARCH_ARM32 -# define ATTRIBUTES _target_attribute("fpu=crypto-neon-fp-armv8") -# else -# ifdef __clang__ - /* - * This used to use "crypto", but that stopped working with clang 16. - * Now only "aes" works. "aes" works with older versions too, so use - * that. No "+" prefix; clang 15 and earlier doesn't accept that. - */ -# define ATTRIBUTES _target_attribute("aes") -# else - /* - * With gcc, only "+crypto" works. Both the "+" prefix and the - * "crypto" (not "aes") are essential... - */ -# define ATTRIBUTES _target_attribute("+crypto") -# endif -# endif + /* + * With gcc, only "+crypto" works. Both the "+" prefix and the + * "crypto" (not "aes") are essential... + */ +# define ATTRIBUTES _target_attribute("+crypto") # endif # define ENABLE_EOR3 0 # include "crc32_pmull_helpers.h" @@ -571,7 +542,7 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) * * See crc32_pmull_wide.h for explanation. */ -#if defined(ARCH_ARM64) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN +#if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN # define crc32_arm_pmullx12_crc crc32_arm_pmullx12_crc # define SUFFIX _pmullx12_crc # if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE @@ -596,7 +567,7 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) * Note: we require HAVE_SHA3_TARGET (or HAVE_SHA3_NATIVE) rather than * HAVE_SHA3_INTRIN, as we have an inline asm fallback for eor3. */ -#if defined(ARCH_ARM64) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && \ +#if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && \ (HAVE_SHA3_TARGET || HAVE_SHA3_NATIVE) # define crc32_arm_pmullx12_crc_eor3 crc32_arm_pmullx12_crc_eor3 # define SUFFIX _pmullx12_crc_eor3 diff --git a/lib/arm/crc32_pmull_helpers.h b/lib/arm/crc32_pmull_helpers.h index 1cd1cc18..a448b7e7 100644 --- a/lib/arm/crc32_pmull_helpers.h +++ b/lib/arm/crc32_pmull_helpers.h @@ -73,7 +73,7 @@ ADD_SUFFIX(clmul_low)(uint8x16_t a, poly64x2_t b) static forceinline ATTRIBUTES uint8x16_t ADD_SUFFIX(clmul_high)(uint8x16_t a, poly64x2_t b) { -#if defined(__clang__) && defined(ARCH_ARM64) +#ifdef __clang__ /* * Use inline asm to ensure that pmull2 is really used. This works * around clang bug https://github.com/llvm/llvm-project/issues/52868. @@ -119,24 +119,6 @@ ADD_SUFFIX(fold_vec)(uint8x16_t src, uint8x16_t dst, poly64x2_t multipliers) } #define fold_vec ADD_SUFFIX(fold_vec) -#undef vtbl -static forceinline ATTRIBUTES uint8x16_t -ADD_SUFFIX(vtbl)(uint8x16_t table, uint8x16_t indices) -{ -#ifdef ARCH_ARM64 - return vqtbl1q_u8(table, indices); -#else - uint8x8x2_t tab2; - - tab2.val[0] = vget_low_u8(table); - tab2.val[1] = vget_high_u8(table); - - return vcombine_u8(vtbl2_u8(tab2, vget_low_u8(indices)), - vtbl2_u8(tab2, vget_high_u8(indices))); -#endif -} -#define vtbl ADD_SUFFIX(vtbl) - /* * Given v containing a 16-byte polynomial, and a pointer 'p' that points to the * next '1 <= len <= 15' data bytes, rearrange the concatenation of v and the @@ -150,8 +132,8 @@ ADD_SUFFIX(fold_partial_vec)(uint8x16_t v, const u8 *p, size_t len, poly64x2_t multipliers_1) { /* - * vtbl(v, shift_tab[len..len+15]) left shifts v by 16-len bytes. - * vtbl(v, shift_tab[len+16..len+31]) right shifts v by len bytes. + * vqtbl1q_u8(v, shift_tab[len..len+15]) left shifts v by 16-len bytes. + * vqtbl1q_u8(v, shift_tab[len+16..len+31]) right shifts v by len bytes. */ static const u8 shift_tab[48] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, @@ -166,7 +148,7 @@ ADD_SUFFIX(fold_partial_vec)(uint8x16_t v, const u8 *p, size_t len, uint8x16_t x0, x1, bsl_mask; /* x0 = v left-shifted by '16 - len' bytes */ - x0 = vtbl(v, lshift); + x0 = vqtbl1q_u8(v, lshift); /* Create a vector of '16 - len' 0x00 bytes, then 'len' 0xff bytes. */ bsl_mask = vreinterpretq_u8_s8( @@ -177,7 +159,7 @@ ADD_SUFFIX(fold_partial_vec)(uint8x16_t v, const u8 *p, size_t len, * bytes) followed by the remaining data. */ x1 = vbslq_u8(bsl_mask /* 0 bits select from arg3, 1 bits from arg2 */, - vld1q_u8(p + len - 16), vtbl(v, rshift)); + vld1q_u8(p + len - 16), vqtbl1q_u8(v, rshift)); return fold_vec(x0, x1, multipliers_1); } From 3a352279feea2d49a48f2f1032fefc82dc3f4143 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 17 Mar 2024 11:45:30 -0700 Subject: [PATCH 04/10] lib/arm: simplify by not trying to skip target attributes As was done in lib/x86/, use the target function attribute even if the features are available natively, as this has no known downside. Exception: this cannot be done for plain simd (NEON), since old versions of clang don't accept the target attribute for it. --- lib/arm/adler32_impl.h | 40 ++++++++++++++--------------- lib/arm/cpu_features.h | 23 ----------------- lib/arm/crc32_impl.h | 58 +++++++++++++++--------------------------- 3 files changed, 41 insertions(+), 80 deletions(-) diff --git a/lib/arm/adler32_impl.h b/lib/arm/adler32_impl.h index e411fd3b..d26f5e13 100644 --- a/lib/arm/adler32_impl.h +++ b/lib/arm/adler32_impl.h @@ -32,15 +32,19 @@ /* Regular NEON implementation */ #if HAVE_NEON_INTRIN && CPU_IS_LITTLE_ENDIAN() -# define adler32_arm_neon adler32_arm_neon +# define adler32_arm_neon adler32_arm_neon # if HAVE_NEON_NATIVE + /* + * Use no attributes if none are needed, to support old versions of clang + * that don't accept the simd target attribute. + */ # define ATTRIBUTES +# elif defined(ARCH_ARM32) +# define ATTRIBUTES _target_attribute("fpu=neon") +# elif defined(__clang__) +# define ATTRIBUTES _target_attribute("simd") # else -# ifdef ARCH_ARM32 -# define ATTRIBUTES _target_attribute("fpu=neon") -# else -# define ATTRIBUTES _target_attribute("+simd") -# endif +# define ATTRIBUTES _target_attribute("+simd") # endif # include static ATTRIBUTES MAYBE_UNUSED u32 @@ -208,21 +212,17 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len) /* NEON+dotprod implementation */ #if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN() # define adler32_arm_neon_dotprod adler32_arm_neon_dotprod -# if HAVE_DOTPROD_NATIVE -# define ATTRIBUTES +# ifdef __clang__ +# define ATTRIBUTES _target_attribute("dotprod") + /* + * With gcc, arch=armv8.2-a is needed for dotprod intrinsics, unless the + * default target is armv8.3-a or later in which case it must be omitted. + * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT. + */ +# elif defined(__ARM_FEATURE_JCVT) +# define ATTRIBUTES _target_attribute("+dotprod") # else -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("dotprod") - /* - * With gcc, arch=armv8.2-a is needed for dotprod intrinsics, unless the - * default target is armv8.3-a or later in which case it must be omitted. - * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT. - */ -# elif defined(__ARM_FEATURE_JCVT) -# define ATTRIBUTES _target_attribute("+dotprod") -# else -# define ATTRIBUTES _target_attribute("arch=armv8.2-a+dotprod") -# endif +# define ATTRIBUTES _target_attribute("arch=armv8.2-a+dotprod") # endif # include static ATTRIBUTES u32 diff --git a/lib/arm/cpu_features.h b/lib/arm/cpu_features.h index 9f4b4e77..e2a845c1 100644 --- a/lib/arm/cpu_features.h +++ b/lib/arm/cpu_features.h @@ -115,29 +115,6 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #else # define HAVE_PMULL_INTRIN 0 #endif -/* - * Set USE_PMULL_TARGET_EVEN_IF_NATIVE if a workaround for a gcc bug that was - * fixed by commit 11a113d501ff ("aarch64: Simplify feature definitions") in gcc - * 13 is needed. A minimal program that fails to build due to this bug when - * compiled with -mcpu=emag, at least with gcc 10 through 12, is: - * - * static inline __attribute__((always_inline,target("+crypto"))) void f() {} - * void g() { f(); } - * - * The error is: - * - * error: inlining failed in call to ‘always_inline’ ‘f’: target specific option mismatch - * - * The workaround is to explicitly add the crypto target to the non-inline - * function g(), even though this should not be required due to -mcpu=emag - * enabling 'crypto' natively and causing __ARM_FEATURE_CRYPTO to be defined. - */ -#if HAVE_PMULL_NATIVE && defined(ARCH_ARM64) && \ - GCC_PREREQ(6, 1) && !GCC_PREREQ(13, 1) -# define USE_PMULL_TARGET_EVEN_IF_NATIVE 1 -#else -# define USE_PMULL_TARGET_EVEN_IF_NATIVE 0 -#endif /* CRC32 */ #ifdef __ARM_FEATURE_CRC32 diff --git a/lib/arm/crc32_impl.h b/lib/arm/crc32_impl.h index b23c9353..c4856eda 100644 --- a/lib/arm/crc32_impl.h +++ b/lib/arm/crc32_impl.h @@ -45,9 +45,7 @@ * variable chunk length wouldn't help much, so we just support a fixed length. */ #if HAVE_CRC32_INTRIN -# if HAVE_CRC32_NATIVE -# define ATTRIBUTES -# elif defined(__clang__) +# ifdef __clang__ # define ATTRIBUTES _target_attribute("crc") # else # define ATTRIBUTES _target_attribute("+crc") @@ -226,9 +224,7 @@ crc32_arm_crc(u32 crc, const u8 *p, size_t len) * for implementations that use pmull for folding the data itself. */ #if HAVE_CRC32_INTRIN && HAVE_PMULL_INTRIN -# if HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE -# define ATTRIBUTES -# elif defined(__clang__) +# ifdef __clang__ # define ATTRIBUTES _target_attribute("crc,aes") # else # define ATTRIBUTES _target_attribute("+crc,+crypto") @@ -422,21 +418,19 @@ crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len) #if HAVE_PMULL_INTRIN # define crc32_arm_pmullx4 crc32_arm_pmullx4 # define SUFFIX _pmullx4 -# if HAVE_PMULL_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE -# define ATTRIBUTES -# elif defined(__clang__) +# ifdef __clang__ /* * This used to use "crypto", but that stopped working with clang 16. * Now only "aes" works. "aes" works with older versions too, so use * that. No "+" prefix; clang 15 and earlier doesn't accept that. */ -# define ATTRIBUTES _target_attribute("aes") +# define ATTRIBUTES _target_attribute("aes") # else /* * With gcc, only "+crypto" works. Both the "+" prefix and the * "crypto" (not "aes") are essential... */ -# define ATTRIBUTES _target_attribute("+crypto") +# define ATTRIBUTES _target_attribute("+crypto") # endif # define ENABLE_EOR3 0 # include "crc32_pmull_helpers.h" @@ -545,14 +539,10 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) #if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN # define crc32_arm_pmullx12_crc crc32_arm_pmullx12_crc # define SUFFIX _pmullx12_crc -# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE -# define ATTRIBUTES +# ifdef __clang__ +# define ATTRIBUTES _target_attribute("aes,crc") # else -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("aes,crc") -# else -# define ATTRIBUTES _target_attribute("+crypto,+crc") -# endif +# define ATTRIBUTES _target_attribute("+crypto,+crc") # endif # define ENABLE_EOR3 0 # include "crc32_pmull_wide.h" @@ -564,29 +554,23 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) * This like crc32_arm_pmullx12_crc(), but it adds the eor3 instruction (from * the sha3 extension) for even better performance. * - * Note: we require HAVE_SHA3_TARGET (or HAVE_SHA3_NATIVE) rather than - * HAVE_SHA3_INTRIN, as we have an inline asm fallback for eor3. + * Note: we require HAVE_SHA3_TARGET rather than HAVE_SHA3_INTRIN, as we have an + * inline asm fallback for eor3. */ -#if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && \ - (HAVE_SHA3_TARGET || HAVE_SHA3_NATIVE) +#if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_TARGET # define crc32_arm_pmullx12_crc_eor3 crc32_arm_pmullx12_crc_eor3 # define SUFFIX _pmullx12_crc_eor3 -# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE && \ - !USE_PMULL_TARGET_EVEN_IF_NATIVE -# define ATTRIBUTES +# ifdef __clang__ +# define ATTRIBUTES _target_attribute("aes,crc,sha3") + /* + * With gcc, arch=armv8.2-a is needed for the sha3 intrinsics, unless the + * default target is armv8.3-a or later in which case it must be omitted. + * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT. + */ +# elif defined(__ARM_FEATURE_JCVT) +# define ATTRIBUTES _target_attribute("+crypto,+crc,+sha3") # else -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("aes,crc,sha3") - /* - * With gcc, arch=armv8.2-a is needed for the sha3 intrinsics, unless the - * default target is armv8.3-a or later in which case it must be omitted. - * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT. - */ -# elif defined(__ARM_FEATURE_JCVT) -# define ATTRIBUTES _target_attribute("+crypto,+crc,+sha3") -# else -# define ATTRIBUTES _target_attribute("arch=armv8.2-a+crypto+crc+sha3") -# endif +# define ATTRIBUTES _target_attribute("arch=armv8.2-a+crypto+crc+sha3") # endif # define ENABLE_EOR3 1 # include "crc32_pmull_wide.h" From 474464163598f5da3c20cd90a83e6c8150ff75cc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 17 Mar 2024 11:45:30 -0700 Subject: [PATCH 05/10] lib/arm: fix arm64 builds with -march=armv8-a+nosimd With MSVC it's necessary to assume that arm64 means NEON is available, but this logic should not be applied generally because gcc and recent versions of clang support arm64 without NEON. --- lib/arm/cpu_features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/arm/cpu_features.h b/lib/arm/cpu_features.h index e2a845c1..a425b93c 100644 --- a/lib/arm/cpu_features.h +++ b/lib/arm/cpu_features.h @@ -78,7 +78,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #endif /* !HAVE_DYNAMIC_ARM_CPU_FEATURES */ /* NEON */ -#if defined(__ARM_NEON) || defined(ARCH_ARM64) +#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(ARCH_ARM64)) # define HAVE_NEON_NATIVE 1 #else # define HAVE_NEON_NATIVE 0 From 6a6654c523ddbdfed9dd386fcc6552fd24ff9dfa Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 17 Mar 2024 11:45:30 -0700 Subject: [PATCH 06/10] lib/arm: centralize the intrinsic header inclusions Include all needed intrinsic headers from lib/arm/cpu_features.h so that includes don't need to be scattered in other places. --- lib/arm/adler32_impl.h | 2 -- lib/arm/cpu_features.h | 11 ++++++++--- lib/arm/crc32_impl.h | 9 --------- lib/arm/crc32_pmull_helpers.h | 2 -- lib/arm/crc32_pmull_wide.h | 5 ----- lib/arm/matchfinder_impl.h | 1 - 6 files changed, 8 insertions(+), 22 deletions(-) diff --git a/lib/arm/adler32_impl.h b/lib/arm/adler32_impl.h index d26f5e13..1834c9c5 100644 --- a/lib/arm/adler32_impl.h +++ b/lib/arm/adler32_impl.h @@ -46,7 +46,6 @@ # else # define ATTRIBUTES _target_attribute("+simd") # endif -# include static ATTRIBUTES MAYBE_UNUSED u32 adler32_arm_neon(u32 adler, const u8 *p, size_t len) { @@ -224,7 +223,6 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len) # else # define ATTRIBUTES _target_attribute("arch=armv8.2-a+dotprod") # endif -# include static ATTRIBUTES u32 adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len) { diff --git a/lib/arm/cpu_features.h b/lib/arm/cpu_features.h index a425b93c..f97fc557 100644 --- a/lib/arm/cpu_features.h +++ b/lib/arm/cpu_features.h @@ -185,16 +185,21 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__) # define __ARM_FEATURE_DOTPROD 1 #endif -#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && defined(__clang__) + +#if HAVE_CRC32_INTRIN && (defined(__GNUC__) || defined(__clang__)) # include +#endif +#if HAVE_NEON_INTRIN +# include +#endif + +#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && defined(__clang__) # undef __ARM_FEATURE_CRC32 #endif #if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__) -# include # undef __ARM_FEATURE_SHA3 #endif #if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__) -# include # undef __ARM_FEATURE_DOTPROD #endif diff --git a/lib/arm/crc32_impl.h b/lib/arm/crc32_impl.h index c4856eda..7118e610 100644 --- a/lib/arm/crc32_impl.h +++ b/lib/arm/crc32_impl.h @@ -51,10 +51,6 @@ # define ATTRIBUTES _target_attribute("+crc") # endif -#ifndef _MSC_VER -# include -#endif - /* * Combine the CRCs for 4 adjacent chunks of length L = CRC32_FIXED_CHUNK_LEN * bytes each by computing: @@ -230,11 +226,6 @@ crc32_arm_crc(u32 crc, const u8 *p, size_t len) # define ATTRIBUTES _target_attribute("+crc,+crypto") # endif -#ifndef _MSC_VER -# include -#endif -#include - /* Do carryless multiplication of two 32-bit values. */ static forceinline ATTRIBUTES u64 clmul_u32(u32 a, u32 b) diff --git a/lib/arm/crc32_pmull_helpers.h b/lib/arm/crc32_pmull_helpers.h index a448b7e7..17de4e23 100644 --- a/lib/arm/crc32_pmull_helpers.h +++ b/lib/arm/crc32_pmull_helpers.h @@ -37,8 +37,6 @@ * Use the eor3 instruction (from the sha3 extension). */ -#include - /* Create a vector with 'a' in the first 4 bytes, and the rest zeroed out. */ #undef u32_to_bytevec static forceinline ATTRIBUTES uint8x16_t diff --git a/lib/arm/crc32_pmull_wide.h b/lib/arm/crc32_pmull_wide.h index 076f0479..300e2829 100644 --- a/lib/arm/crc32_pmull_wide.h +++ b/lib/arm/crc32_pmull_wide.h @@ -45,11 +45,6 @@ * Apple M1 processor is an example of such a CPU. */ -#ifndef _MSC_VER -# include -#endif -#include - #include "crc32_pmull_helpers.h" static ATTRIBUTES u32 diff --git a/lib/arm/matchfinder_impl.h b/lib/arm/matchfinder_impl.h index b20f56a3..79c1dbc9 100644 --- a/lib/arm/matchfinder_impl.h +++ b/lib/arm/matchfinder_impl.h @@ -31,7 +31,6 @@ #include "cpu_features.h" #if HAVE_NEON_NATIVE -# include static forceinline void matchfinder_init_neon(mf_pos_t *data, size_t size) { From 236c9dfbd84f274f25c6a1f5d48f9109249ffbae Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 17 Mar 2024 11:45:30 -0700 Subject: [PATCH 07/10] lib/arm: simplify conditions for detecting intrinsics - Don't check *_NATIVE or HAVE_DYNAMIC_ARM_CPU_FEATURES, since technically these are orthognal to intrinsic support. It's true that when building for an operating system that doesn't have runtime CPU feature detection enabled, there is no use in using intrinsics except when the features are supported natively. But we can still build the code; it just won't be called and will be optimized out as unused. - Don't place conditions like defined(ARCH_ARM64) and !defined(_MSC_VER) on HAVE_SHA3_NATIVE and HAVE_DOTPROD_NATIVE. These conditions are only relevant to intrinsics, not the CPU feature per se. --- lib/arm/cpu_features.h | 65 +++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 36 deletions(-) diff --git a/lib/arm/cpu_features.h b/lib/arm/cpu_features.h index f97fc557..417af7b9 100644 --- a/lib/arm/cpu_features.h +++ b/lib/arm/cpu_features.h @@ -88,8 +88,8 @@ static inline u32 get_arm_cpu_features(void) { return 0; } * NEON enabled already. Exception: with gcc 6.1 and later (r230411 for arm32, * r226563 for arm64), hardware floating point support is sufficient. */ -#if HAVE_NEON_NATIVE || \ - (HAVE_DYNAMIC_ARM_CPU_FEATURES && GCC_PREREQ(6, 1) && defined(__ARM_FP)) +#if (defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) && \ + (HAVE_NEON_NATIVE || (GCC_PREREQ(6, 1) && defined(__ARM_FP))) # define HAVE_NEON_INTRIN 1 #else # define HAVE_NEON_INTRIN 0 @@ -101,11 +101,10 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #else # define HAVE_PMULL_NATIVE 0 #endif -#if defined(ARCH_ARM64) && \ - (HAVE_PMULL_NATIVE || \ - (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ - (GCC_PREREQ(6, 1) || defined(__clang__) || defined(_MSC_VER)))) -# define HAVE_PMULL_INTRIN CPU_IS_LITTLE_ENDIAN() /* untested on big endian */ +#if defined(ARCH_ARM64) && HAVE_NEON_INTRIN && \ + (GCC_PREREQ(6, 1) || defined(__clang__) || defined(_MSC_VER)) && \ + CPU_IS_LITTLE_ENDIAN() /* untested on big endian */ +# define HAVE_PMULL_INTRIN 1 /* Work around MSVC's vmull_p64() taking poly64x1_t instead of poly64_t */ # ifdef _MSC_VER # define compat_vmull_p64(a, b) vmull_p64(vcreate_p64(a), vcreate_p64(b)) @@ -122,50 +121,44 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #else # define HAVE_CRC32_NATIVE 0 #endif -#if defined(ARCH_ARM64) && (HAVE_CRC32_NATIVE || defined(__GNUC__) || \ - defined(__clang__) || defined(_MSC_VER)) +#if defined(ARCH_ARM64) && \ + (defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) # define HAVE_CRC32_INTRIN 1 #else # define HAVE_CRC32_INTRIN 0 #endif /* SHA3 (needed for the eor3 instruction) */ -#if defined(ARCH_ARM64) && !defined(_MSC_VER) -# ifdef __ARM_FEATURE_SHA3 -# define HAVE_SHA3_NATIVE 1 -# else -# define HAVE_SHA3_NATIVE 0 -# endif -# define HAVE_SHA3_TARGET (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ - (GCC_PREREQ(8, 1) /* r256478 */ || \ - CLANG_PREREQ(7, 0, 10010463) /* r338010 */)) -# define HAVE_SHA3_INTRIN (HAVE_NEON_INTRIN && \ - (HAVE_SHA3_NATIVE || HAVE_SHA3_TARGET) && \ - (GCC_PREREQ(9, 1) /* r268049 */ || \ - CLANG_PREREQ(13, 0, 13160000))) +#ifdef __ARM_FEATURE_SHA3 +# define HAVE_SHA3_NATIVE 1 #else # define HAVE_SHA3_NATIVE 0 +#endif +#if defined(ARCH_ARM64) && \ + (GCC_PREREQ(8, 1) /* r256478 */ || \ + CLANG_PREREQ(7, 0, 10010463) /* r338010 */) +# define HAVE_SHA3_TARGET 1 +#else # define HAVE_SHA3_TARGET 0 +#endif +#if defined(ARCH_ARM64) && HAVE_NEON_INTRIN && \ + (GCC_PREREQ(9, 1) /* r268049 */ || \ + CLANG_PREREQ(13, 0, 13160000)) +# define HAVE_SHA3_INTRIN 1 +#else # define HAVE_SHA3_INTRIN 0 #endif /* dotprod */ -#ifdef ARCH_ARM64 -# ifdef __ARM_FEATURE_DOTPROD -# define HAVE_DOTPROD_NATIVE 1 -# else -# define HAVE_DOTPROD_NATIVE 0 -# endif -# if HAVE_DOTPROD_NATIVE || \ - (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ - (GCC_PREREQ(8, 1) || CLANG_PREREQ(7, 0, 10010000) || \ - defined(_MSC_VER))) -# define HAVE_DOTPROD_INTRIN 1 -# else -# define HAVE_DOTPROD_INTRIN 0 -# endif +#ifdef __ARM_FEATURE_DOTPROD +# define HAVE_DOTPROD_NATIVE 1 #else # define HAVE_DOTPROD_NATIVE 0 +#endif +#if defined(ARCH_ARM64) && HAVE_NEON_INTRIN && \ + (GCC_PREREQ(8, 1) || CLANG_PREREQ(7, 0, 10010000) || defined(_MSC_VER)) +# define HAVE_DOTPROD_INTRIN 1 +#else # define HAVE_DOTPROD_INTRIN 0 #endif From 3b766cd46d2cd33ca728e563e80f36acfcad0ca7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 17 Mar 2024 11:45:30 -0700 Subject: [PATCH 08/10] lib/arm: use asm fallback when clang intrinsics unusable Instead of manually defining macros like __ARM_FEATURE_CRC32 to get the intrinsic headers of clang 15 and earlier to work, just use inline assembly. This should be a better solution as it does not rely on clang implementation details as much. We already used an inline assembly fallback for veor3q_u8 with gcc 8, and with clang 7 through 12. This commit extends the same pattern to the crc32 and dotprod intrinsics, and extends the version range to clang 15. It also drops gcc 8 from the veor3q_u8 fallback, as that is just a single major version and not worth enabling the fallback for. --- lib/arm/cpu_features.h | 107 +++++++++++++++++++++------------- lib/arm/crc32_impl.h | 5 +- lib/arm/crc32_pmull_helpers.h | 10 +--- 3 files changed, 67 insertions(+), 55 deletions(-) diff --git a/lib/arm/cpu_features.h b/lib/arm/cpu_features.h index 417af7b9..67c24374 100644 --- a/lib/arm/cpu_features.h +++ b/lib/arm/cpu_features.h @@ -91,6 +91,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #if (defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) && \ (HAVE_NEON_NATIVE || (GCC_PREREQ(6, 1) && defined(__ARM_FP))) # define HAVE_NEON_INTRIN 1 +# include #else # define HAVE_NEON_INTRIN 0 #endif @@ -124,6 +125,41 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #if defined(ARCH_ARM64) && \ (defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) # define HAVE_CRC32_INTRIN 1 +# if defined(__GNUC__) || defined(__clang__) +# include +# endif + /* + * Use an inline assembly fallback for clang 15 and earlier, which only + * defined the crc32 intrinsics when crc32 is enabled in the main target. + */ +# if defined(__clang__) && !CLANG_PREREQ(16, 0, 16000000) && \ + !defined(__ARM_FEATURE_CRC32) +# undef __crc32b +# define __crc32b(a, b) \ + ({ uint32_t res; \ + __asm__("crc32b %w0, %w1, %w2" \ + : "=r" (res) : "r" (a), "r" (b)); \ + res; }) +# undef __crc32h +# define __crc32h(a, b) \ + ({ uint32_t res; \ + __asm__("crc32h %w0, %w1, %w2" \ + : "=r" (res) : "r" (a), "r" (b)); \ + res; }) +# undef __crc32w +# define __crc32w(a, b) \ + ({ uint32_t res; \ + __asm__("crc32w %w0, %w1, %w2" \ + : "=r" (res) : "r" (a), "r" (b)); \ + res; }) +# undef __crc32d +# define __crc32d(a, b) \ + ({ uint32_t res; \ + __asm__("crc32x %w0, %w1, %2" \ + : "=r" (res) : "r" (a), "r" (b)); \ + res; }) +# pragma clang diagnostic ignored "-Wgnu-statement-expression" +# endif #else # define HAVE_CRC32_INTRIN 0 #endif @@ -134,17 +170,24 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #else # define HAVE_SHA3_NATIVE 0 #endif -#if defined(ARCH_ARM64) && \ - (GCC_PREREQ(8, 1) /* r256478 */ || \ - CLANG_PREREQ(7, 0, 10010463) /* r338010 */) -# define HAVE_SHA3_TARGET 1 -#else -# define HAVE_SHA3_TARGET 0 -#endif #if defined(ARCH_ARM64) && HAVE_NEON_INTRIN && \ (GCC_PREREQ(9, 1) /* r268049 */ || \ - CLANG_PREREQ(13, 0, 13160000)) + CLANG_PREREQ(7, 0, 10010463) /* r338010 */) # define HAVE_SHA3_INTRIN 1 + /* + * Use an inline assembly fallback for clang 15 and earlier, which only + * defined the sha3 intrinsics when sha3 is enabled in the main target. + */ +# if defined(__clang__) && !CLANG_PREREQ(16, 0, 16000000) && \ + !defined(__ARM_FEATURE_SHA3) +# undef veor3q_u8 +# define veor3q_u8(a, b, c) \ + ({ uint8x16_t res; \ + __asm__("eor3 %0.16b, %1.16b, %2.16b, %3.16b" \ + : "=w" (res) : "w" (a), "w" (b), "w" (c)); \ + res; }) +# pragma clang diagnostic ignored "-Wgnu-statement-expression" +# endif #else # define HAVE_SHA3_INTRIN 0 #endif @@ -158,44 +201,24 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #if defined(ARCH_ARM64) && HAVE_NEON_INTRIN && \ (GCC_PREREQ(8, 1) || CLANG_PREREQ(7, 0, 10010000) || defined(_MSC_VER)) # define HAVE_DOTPROD_INTRIN 1 + /* + * Use an inline assembly fallback for clang 15 and earlier, which only + * defined the dotprod intrinsics when dotprod is enabled in the main target. + */ +# if defined(__clang__) && !CLANG_PREREQ(16, 0, 16000000) && \ + !defined(__ARM_FEATURE_DOTPROD) +# undef vdotq_u32 +# define vdotq_u32(a, b, c) \ + ({ uint32x4_t res = (a); \ + __asm__("udot %0.4s, %1.16b, %2.16b" \ + : "+w" (res) : "w" (b), "w" (c)); \ + res; }) +# pragma clang diagnostic ignored "-Wgnu-statement-expression" +# endif #else # define HAVE_DOTPROD_INTRIN 0 #endif -/* - * Work around bugs in arm_acle.h and arm_neon.h where sometimes intrinsics are - * only defined when the corresponding __ARM_FEATURE_* macro is defined. The - * intrinsics actually work in target attribute functions too if they are - * defined, though, so work around this by temporarily defining the - * corresponding __ARM_FEATURE_* macros while including the headers. - */ -#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && defined(__clang__) -# define __ARM_FEATURE_CRC32 1 -#endif -#if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__) -# define __ARM_FEATURE_SHA3 1 -#endif -#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__) -# define __ARM_FEATURE_DOTPROD 1 -#endif - -#if HAVE_CRC32_INTRIN && (defined(__GNUC__) || defined(__clang__)) -# include -#endif -#if HAVE_NEON_INTRIN -# include -#endif - -#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && defined(__clang__) -# undef __ARM_FEATURE_CRC32 -#endif -#if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__) -# undef __ARM_FEATURE_SHA3 -#endif -#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__) -# undef __ARM_FEATURE_DOTPROD -#endif - #endif /* ARCH_ARM32 || ARCH_ARM64 */ #endif /* LIB_ARM_CPU_FEATURES_H */ diff --git a/lib/arm/crc32_impl.h b/lib/arm/crc32_impl.h index 7118e610..3c4bec72 100644 --- a/lib/arm/crc32_impl.h +++ b/lib/arm/crc32_impl.h @@ -544,11 +544,8 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) * * This like crc32_arm_pmullx12_crc(), but it adds the eor3 instruction (from * the sha3 extension) for even better performance. - * - * Note: we require HAVE_SHA3_TARGET rather than HAVE_SHA3_INTRIN, as we have an - * inline asm fallback for eor3. */ -#if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_TARGET +#if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_INTRIN # define crc32_arm_pmullx12_crc_eor3 crc32_arm_pmullx12_crc_eor3 # define SUFFIX _pmullx12_crc_eor3 # ifdef __clang__ diff --git a/lib/arm/crc32_pmull_helpers.h b/lib/arm/crc32_pmull_helpers.h index 17de4e23..023b9044 100644 --- a/lib/arm/crc32_pmull_helpers.h +++ b/lib/arm/crc32_pmull_helpers.h @@ -91,18 +91,10 @@ static forceinline ATTRIBUTES uint8x16_t ADD_SUFFIX(eor3)(uint8x16_t a, uint8x16_t b, uint8x16_t c) { #if ENABLE_EOR3 -#if HAVE_SHA3_INTRIN return veor3q_u8(a, b, c); #else - uint8x16_t res; - - __asm__("eor3 %0.16b, %1.16b, %2.16b, %3.16b" - : "=w" (res) : "w" (a), "w" (b), "w" (c)); - return res; -#endif -#else /* ENABLE_EOR3 */ return veorq_u8(veorq_u8(a, b), c); -#endif /* !ENABLE_EOR3 */ +#endif } #define eor3 ADD_SUFFIX(eor3) From 935b2e26d495ca451d562ff62fac1bf99507b9ed Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 17 Mar 2024 11:45:30 -0700 Subject: [PATCH 09/10] lib/arm: remove unnecessary NATIVE macros Since most of the uses of the HAVE_*_NATIVE macros have been removed, and most of them provide no additional value over the original compiler-provided macro like __ARM_FEATURE_CRC32 anyway, there's not much point in having them anymore. Remove them, except for HAVE_NEON_NATIVE which is still worthwhile to have. --- lib/arm/adler32_impl.h | 2 +- lib/arm/cpu_features.h | 24 ++++++++++-------------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/lib/arm/adler32_impl.h b/lib/arm/adler32_impl.h index 1834c9c5..6453b8e6 100644 --- a/lib/arm/adler32_impl.h +++ b/lib/arm/adler32_impl.h @@ -332,7 +332,7 @@ adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len) #undef ATTRIBUTES #endif /* NEON+dotprod implementation */ -#if defined(adler32_arm_neon_dotprod) && HAVE_DOTPROD_NATIVE +#if defined(adler32_arm_neon_dotprod) && defined(__ARM_FEATURE_DOTPROD) #define DEFAULT_IMPL adler32_arm_neon_dotprod #else static inline adler32_func_t diff --git a/lib/arm/cpu_features.h b/lib/arm/cpu_features.h index 67c24374..9ff85d5a 100644 --- a/lib/arm/cpu_features.h +++ b/lib/arm/cpu_features.h @@ -55,12 +55,6 @@ #define ARM_CPU_FEATURE_SHA3 (1 << 4) #define ARM_CPU_FEATURE_DOTPROD (1 << 5) -#define HAVE_NEON(features) (HAVE_NEON_NATIVE || ((features) & ARM_CPU_FEATURE_NEON)) -#define HAVE_PMULL(features) (HAVE_PMULL_NATIVE || ((features) & ARM_CPU_FEATURE_PMULL)) -#define HAVE_CRC32(features) (HAVE_CRC32_NATIVE || ((features) & ARM_CPU_FEATURE_CRC32)) -#define HAVE_SHA3(features) (HAVE_SHA3_NATIVE || ((features) & ARM_CPU_FEATURE_SHA3)) -#define HAVE_DOTPROD(features) (HAVE_DOTPROD_NATIVE || ((features) & ARM_CPU_FEATURE_DOTPROD)) - #if HAVE_DYNAMIC_ARM_CPU_FEATURES #define ARM_CPU_FEATURES_KNOWN (1U << 31) extern volatile u32 libdeflate_arm_cpu_features; @@ -79,8 +73,10 @@ static inline u32 get_arm_cpu_features(void) { return 0; } /* NEON */ #if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(ARCH_ARM64)) +# define HAVE_NEON(features) 1 # define HAVE_NEON_NATIVE 1 #else +# define HAVE_NEON(features) ((features) & ARM_CPU_FEATURE_NEON) # define HAVE_NEON_NATIVE 0 #endif /* @@ -98,9 +94,9 @@ static inline u32 get_arm_cpu_features(void) { return 0; } /* PMULL */ #ifdef __ARM_FEATURE_CRYPTO -# define HAVE_PMULL_NATIVE 1 +# define HAVE_PMULL(features) 1 #else -# define HAVE_PMULL_NATIVE 0 +# define HAVE_PMULL(features) ((features) & ARM_CPU_FEATURE_PMULL) #endif #if defined(ARCH_ARM64) && HAVE_NEON_INTRIN && \ (GCC_PREREQ(6, 1) || defined(__clang__) || defined(_MSC_VER)) && \ @@ -118,9 +114,9 @@ static inline u32 get_arm_cpu_features(void) { return 0; } /* CRC32 */ #ifdef __ARM_FEATURE_CRC32 -# define HAVE_CRC32_NATIVE 1 +# define HAVE_CRC32(features) 1 #else -# define HAVE_CRC32_NATIVE 0 +# define HAVE_CRC32(features) ((features) & ARM_CPU_FEATURE_CRC32) #endif #if defined(ARCH_ARM64) && \ (defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) @@ -166,9 +162,9 @@ static inline u32 get_arm_cpu_features(void) { return 0; } /* SHA3 (needed for the eor3 instruction) */ #ifdef __ARM_FEATURE_SHA3 -# define HAVE_SHA3_NATIVE 1 +# define HAVE_SHA3(features) 1 #else -# define HAVE_SHA3_NATIVE 0 +# define HAVE_SHA3(features) ((features) & ARM_CPU_FEATURE_SHA3) #endif #if defined(ARCH_ARM64) && HAVE_NEON_INTRIN && \ (GCC_PREREQ(9, 1) /* r268049 */ || \ @@ -194,9 +190,9 @@ static inline u32 get_arm_cpu_features(void) { return 0; } /* dotprod */ #ifdef __ARM_FEATURE_DOTPROD -# define HAVE_DOTPROD_NATIVE 1 +# define HAVE_DOTPROD(features) 1 #else -# define HAVE_DOTPROD_NATIVE 0 +# define HAVE_DOTPROD(features) ((features) & ARM_CPU_FEATURE_DOTPROD) #endif #if defined(ARCH_ARM64) && HAVE_NEON_INTRIN && \ (GCC_PREREQ(8, 1) || CLANG_PREREQ(7, 0, 10010000) || defined(_MSC_VER)) From 45a5de7b044445a2c843c5051f7f9b800959c927 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 17 Mar 2024 11:45:30 -0700 Subject: [PATCH 10/10] ci.yml: work around ASAN bug on ubuntu-22.04 --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e158e7db..b2aba936 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,6 +17,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y clang llvm libz-dev valgrind + - run: sudo sysctl kernel.randomize_va_space=0 # https://bugs.launchpad.net/ubuntu/+source/llvm-toolchain-14/+bug/2048768 - run: scripts/run_tests.sh - name: Direct compilation without official build system run: $CC -O2 -Wall -Werror lib/*{,/*}.c programs/{gzip,prog_util,tgetopt}.c -o libdeflate-gzip @@ -287,5 +288,6 @@ jobs: sudo apt-get install -y clang llvm - name: Fuzz run: | + sudo sysctl kernel.randomize_va_space=0 # https://bugs.launchpad.net/ubuntu/+source/llvm-toolchain-14/+bug/2048768 scripts/libFuzzer/fuzz.sh --time=120 ${{matrix.sanitizer}} \ ${{matrix.target}}