Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARM CPU feature cleanups #355

Merged
merged 10 commits into from
Mar 17, 2024
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ jobs:
run: |
sudo apt-get update
sudo apt-get install -y clang llvm libz-dev valgrind
- run: sudo sysctl kernel.randomize_va_space=0 # https://bugs.launchpad.net/ubuntu/+source/llvm-toolchain-14/+bug/2048768
- run: scripts/run_tests.sh
- name: Direct compilation without official build system
run: $CC -O2 -Wall -Werror lib/*{,/*}.c programs/{gzip,prog_util,tgetopt}.c -o libdeflate-gzip
Expand Down Expand Up @@ -287,5 +288,6 @@ jobs:
sudo apt-get install -y clang llvm
- name: Fuzz
run: |
sudo sysctl kernel.randomize_va_space=0 # https://bugs.launchpad.net/ubuntu/+source/llvm-toolchain-14/+bug/2048768
scripts/libFuzzer/fuzz.sh --time=120 ${{matrix.sanitizer}} \
${{matrix.target}}
44 changes: 21 additions & 23 deletions lib/arm/adler32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,20 @@

/* Regular NEON implementation */
#if HAVE_NEON_INTRIN && CPU_IS_LITTLE_ENDIAN()
# define adler32_arm_neon adler32_arm_neon
# define adler32_arm_neon adler32_arm_neon
# if HAVE_NEON_NATIVE
/*
* Use no attributes if none are needed, to support old versions of clang
* that don't accept the simd target attribute.
*/
# define ATTRIBUTES
# elif defined(ARCH_ARM32)
# define ATTRIBUTES _target_attribute("fpu=neon")
# elif defined(__clang__)
# define ATTRIBUTES _target_attribute("simd")
# else
# ifdef ARCH_ARM32
# define ATTRIBUTES _target_attribute("fpu=neon")
# else
# define ATTRIBUTES _target_attribute("+simd")
# endif
# define ATTRIBUTES _target_attribute("+simd")
# endif
# include <arm_neon.h>
static ATTRIBUTES MAYBE_UNUSED u32
adler32_arm_neon(u32 adler, const u8 *p, size_t len)
{
Expand Down Expand Up @@ -208,23 +211,18 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len)
/* NEON+dotprod implementation */
#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN()
# define adler32_arm_neon_dotprod adler32_arm_neon_dotprod
# if HAVE_DOTPROD_NATIVE
# define ATTRIBUTES
# ifdef __clang__
# define ATTRIBUTES _target_attribute("dotprod")
/*
* With gcc, arch=armv8.2-a is needed for dotprod intrinsics, unless the
* default target is armv8.3-a or later in which case it must be omitted.
* armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT.
*/
# elif defined(__ARM_FEATURE_JCVT)
# define ATTRIBUTES _target_attribute("+dotprod")
# else
# ifdef __clang__
# define ATTRIBUTES _target_attribute("dotprod")
/*
* With gcc, arch=armv8.2-a is needed for dotprod intrinsics, unless the
* default target is armv8.3-a or later in which case it must be omitted.
* armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT.
*/
# elif defined(__ARM_FEATURE_JCVT)
# define ATTRIBUTES _target_attribute("+dotprod")
# else
# define ATTRIBUTES _target_attribute("arch=armv8.2-a+dotprod")
# endif
# define ATTRIBUTES _target_attribute("arch=armv8.2-a+dotprod")
# endif
# include <arm_neon.h>
static ATTRIBUTES u32
adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len)
{
Expand Down Expand Up @@ -334,7 +332,7 @@ adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len)
#undef ATTRIBUTES
#endif /* NEON+dotprod implementation */

#if defined(adler32_arm_neon_dotprod) && HAVE_DOTPROD_NATIVE
#if defined(adler32_arm_neon_dotprod) && defined(__ARM_FEATURE_DOTPROD)
#define DEFAULT_IMPL adler32_arm_neon_dotprod
#else
static inline adler32_func_t
Expand Down
19 changes: 15 additions & 4 deletions lib/arm/cpu_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,6 @@ static u32 query_arm_cpu_features(void)
STATIC_ASSERT(sizeof(long) == 4);
if (hwcap & (1 << 12)) /* HWCAP_NEON */
features |= ARM_CPU_FEATURE_NEON;
if (hwcap2 & (1 << 1)) /* HWCAP2_PMULL */
features |= ARM_CPU_FEATURE_PMULL;
if (hwcap2 & (1 << 4)) /* HWCAP2_CRC32 */
features |= ARM_CPU_FEATURE_CRC32;
#else
STATIC_ASSERT(sizeof(long) == 8);
if (hwcap & (1 << 1)) /* HWCAP_ASIMD */
Expand All @@ -138,6 +134,7 @@ static u32 query_arm_cpu_features(void)

#include <sys/types.h>
#include <sys/sysctl.h>
#include <TargetConditionals.h>

static const struct {
const char *name;
Expand Down Expand Up @@ -192,6 +189,7 @@ static u32 query_arm_cpu_features(void)
static const struct cpu_feature arm_cpu_feature_table[] = {
{ARM_CPU_FEATURE_NEON, "neon"},
{ARM_CPU_FEATURE_PMULL, "pmull"},
{ARM_CPU_FEATURE_PREFER_PMULL, "prefer_pmull"},
{ARM_CPU_FEATURE_CRC32, "crc32"},
{ARM_CPU_FEATURE_SHA3, "sha3"},
{ARM_CPU_FEATURE_DOTPROD, "dotprod"},
Expand All @@ -203,6 +201,19 @@ void libdeflate_init_arm_cpu_features(void)
{
u32 features = query_arm_cpu_features();

/*
* On the Apple M1 processor, crc32 instructions max out at about 25.5
* GB/s in the best case of using a 3-way or greater interleaved chunked
* implementation, whereas a pmull-based implementation achieves 68 GB/s
* provided that the stride length is large enough (about 10+ vectors
* with eor3, or 12+ without).
*
* Assume that crc32 instructions are preferable in other cases.
*/
#if (defined(__APPLE__) && TARGET_OS_OSX) || defined(TEST_SUPPORT__DO_NOT_USE)
features |= ARM_CPU_FEATURE_PREFER_PMULL;
#endif

disable_cpu_features_for_testing(&features, arm_cpu_feature_table,
ARRAY_LEN(arm_cpu_feature_table));

Expand Down
Loading
Loading