Skip to content

Commit

Permalink
Branchfree bitmask calculation
Browse files Browse the repository at this point in the history
  • Loading branch information
kimwalisch committed Jul 28, 2024
1 parent 2d00ab3 commit 4ffaf46
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 60 deletions.
23 changes: 11 additions & 12 deletions cmake/multiarch_avx512_vpopcnt.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,12 @@ set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}")

check_cxx_source_compiles("
// GCC/Clang function multiversioning for AVX512 is not needed if
// the user compiles with -mavx512f -mavx512vpopcntdq -mbmi2.
// the user compiles with -mavx512f -mavx512vpopcntdq.
// GCC/Clang function multiversioning generally causes a minor
// overhead, hence we disable it if it is not needed.
#if defined(__AVX512F__) && \
defined(__AVX512VPOPCNTDQ__) && \
defined(__BMI2__)
Error: AVX512 BMI2 multiarch not needed!
defined(__AVX512VPOPCNTDQ__)
Error: AVX512 multiarch not needed!
#endif
#include <src/x86/cpuid.cpp>
Expand All @@ -27,8 +26,8 @@ check_cxx_source_compiles("
class Sieve {
public:
uint64_t count_default(uint64_t* array, uint64_t stop_idx);
__attribute__ ((target (\"avx512f,avx512vpopcntdq,bmi2\")))
uint64_t count_avx512_bmi2(uint64_t* array, uint64_t stop_idx);
__attribute__ ((target (\"avx512f,avx512vpopcntdq\")))
uint64_t count_avx512(uint64_t* array, uint64_t stop_idx);
};
uint64_t Sieve::count_default(uint64_t* array, uint64_t stop_idx)
Expand All @@ -39,8 +38,8 @@ check_cxx_source_compiles("
return res;
}
__attribute__ ((target (\"avx512f,avx512vpopcntdq,bmi2\")))
uint64_t Sieve::count_avx512_bmi2(uint64_t* array, uint64_t stop_idx)
__attribute__ ((target (\"avx512f,avx512vpopcntdq\")))
uint64_t Sieve::count_avx512(uint64_t* array, uint64_t stop_idx)
{
uint64_t i = 0;
__m512i vcnt = _mm512_setzero_si512();
Expand All @@ -52,7 +51,7 @@ check_cxx_source_compiles("
vcnt = _mm512_add_epi64(vcnt, vec);
}
__mmask8 mask = (__mmask8) _bzhi_u64(0xff, stop_idx - i);
__mmask8 mask = (__mmask8) (0xff >> (i + 8 - stop_idx));
__m512i vec = _mm512_maskz_loadu_epi64(mask , &array[i]);
vec = _mm512_popcnt_epi64(vec);
vcnt = _mm512_add_epi64(vcnt, vec);
Expand All @@ -65,8 +64,8 @@ check_cxx_source_compiles("
uint64_t cnt = 0;
Sieve sieve;
if (primecount::has_cpuid_avx512_bmi2())
cnt = sieve.count_avx512_bmi2(&array[0], 10);
if (primecount::has_cpuid_avx512_vpopcnt())
cnt = sieve.count_avx512(&array[0], 10);
else
cnt = sieve.count_default(&array[0], 10);
Expand All @@ -75,7 +74,7 @@ check_cxx_source_compiles("
" multiarch_avx512_vpopcnt)

if(multiarch_avx512_vpopcnt)
list(APPEND PRIMECOUNT_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_AVX512_BMI2")
list(APPEND PRIMECOUNT_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_AVX512_VPOPCNT")
endif()

cmake_pop_check_state()
73 changes: 36 additions & 37 deletions include/Sieve.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,19 +50,17 @@

#elif defined(__AVX512F__) && \
defined(__AVX512VPOPCNTDQ__) && \
defined(__BMI2__) && \
!defined(__i386__) /* misses _bzhi_u64() */ && \
__has_include(<immintrin.h>)
#include <immintrin.h>
#define ENABLE_AVX512_BMI2
#define ENABLE_AVX512_VPOPCNT

#elif defined(ENABLE_MULTIARCH_ARM_SVE)
#include <cpu_supports_arm_sve.hpp>
#include <arm_sve.h>
#define ENABLE_DEFAULT

#elif defined(ENABLE_MULTIARCH_AVX512_BMI2)
#include <cpu_supports_avx512_bmi2.hpp>
#elif defined(ENABLE_MULTIARCH_AVX512_VPOPCNT)
#include <cpu_supports_avx512_vpopcnt.hpp>
#include <immintrin.h>
#define ENABLE_DEFAULT
#else
Expand Down Expand Up @@ -129,12 +127,12 @@ class Sieve
{
#if defined(ENABLE_ARM_SVE)
return count_arm_sve(start, stop);
#elif defined(ENABLE_AVX512_BMI2)
return count_avx512_bmi2(start, stop);
#elif defined(ENABLE_AVX512_VPOPCNT)
return count_avx512(start, stop);
#elif defined(ENABLE_MULTIARCH_ARM_SVE)
return cpu_supports_sve ? count_arm_sve(start, stop) : count_default(start, stop);
#elif defined(ENABLE_MULTIARCH_AVX512_BMI2)
return cpu_supports_avx512_bmi2 ? count_avx512_bmi2(start, stop) : count_default(start, stop);
#elif defined(ENABLE_MULTIARCH_AVX512_VPOPCNT)
return cpu_supports_avx512_vpopcnt ? count_avx512(start, stop) : count_default(start, stop);
#else
return count_default(start, stop);
#endif
Expand Down Expand Up @@ -180,18 +178,18 @@ class Sieve

#endif

#if defined(ENABLE_AVX512_BMI2) || \
defined(ENABLE_MULTIARCH_AVX512_BMI2)
#if defined(ENABLE_AVX512_VPOPCNT) || \
defined(ENABLE_MULTIARCH_AVX512_VPOPCNT)

/// Count 1 bits inside [start, stop].
/// The distance [start, stop] is small here < sqrt(segment_size),
/// hence we simply count the number of unsieved elements
/// by linearly iterating over the sieve array.
///
#if defined(ENABLE_MULTIARCH_AVX512_BMI2)
__attribute__ ((target ("avx512f,avx512vpopcntdq,bmi2")))
#if defined(ENABLE_MULTIARCH_AVX512_VPOPCNT)
__attribute__ ((target ("avx512f,avx512vpopcntdq")))
#endif
uint64_t count_avx512_bmi2(uint64_t start, uint64_t stop) const
uint64_t count_avx512(uint64_t start, uint64_t stop) const
{
if (start > stop)
return 0;
Expand All @@ -203,33 +201,34 @@ class Sieve
uint64_t m2 = unset_larger[stop % 240];
const uint64_t* sieve64 = (const uint64_t*) sieve_.data();

if (start_idx == stop_idx)
return popcnt64(sieve64[start_idx] & m1 & m2);
else
// Branchfree bitmask calculation:
// m1 = (start_idx != stop_idx) ? m1 : m1 & m2;
m1 = (m1 * (start_idx != stop_idx)) | ((m1 & m2) * (start_idx == stop_idx));
// m2 = (start_idx != stop_idx) ? m2 : 0;
m2 *= (start_idx != stop_idx);

uint64_t i = start_idx + 1;
uint64_t start_bits = sieve64[start_idx] & m1;
uint64_t stop_bits = sieve64[stop_idx] & m2;
__m512i vec = _mm512_set_epi64(0, 0, 0, 0, 0, 0, stop_bits, start_bits);
__m512i vcnt = _mm512_popcnt_epi64(vec);

// Compute this for loop using AVX512.
// for (i = start_idx + 1; i < stop_idx; i++)
// cnt += popcnt64(sieve64[i]);
//
for (; i + 8 < stop_idx; i += 8)
{
uint64_t i = start_idx + 1;
uint64_t start_bits = sieve64[start_idx] & m1;
uint64_t stop_bits = sieve64[stop_idx] & m2;
__m512i vec = _mm512_set_epi64(0, 0, 0, 0, 0, 0, stop_bits, start_bits);
__m512i vcnt = _mm512_popcnt_epi64(vec);

// Compute this for loop using AVX512.
// for (i = start_idx + 1; i < stop_idx; i++)
// cnt += popcnt64(sieve64[i]);
//
for (; i + 8 < stop_idx; i += 8)
{
vec = _mm512_loadu_epi64(&sieve64[i]);
vec = _mm512_popcnt_epi64(vec);
vcnt = _mm512_add_epi64(vcnt, vec);
}

__mmask8 mask = (__mmask8) _bzhi_u64(0xff, stop_idx - i);
vec = _mm512_maskz_loadu_epi64(mask, &sieve64[i]);
vec = _mm512_loadu_epi64(&sieve64[i]);
vec = _mm512_popcnt_epi64(vec);
vcnt = _mm512_add_epi64(vcnt, vec);
return _mm512_reduce_add_epi64(vcnt);
}

__mmask8 mask = (__mmask8) (0xff >> (i + 8 - stop_idx));
vec = _mm512_maskz_loadu_epi64(mask, &sieve64[i]);
vec = _mm512_popcnt_epi64(vec);
vcnt = _mm512_add_epi64(vcnt, vec);
return _mm512_reduce_add_epi64(vcnt);
}

#elif defined(ENABLE_ARM_SVE) || \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
///
/// @file cpu_supports_avx512_bmi2.hpp
/// @file cpu_supports_avx512_vpopcnt.hpp
/// @brief Detect if the x86 CPU supports AVX512 and BMI2.
///
/// Copyright (C) 2024 Kim Walisch, <kim.walisch@gmail.com>
Expand All @@ -8,19 +8,19 @@
/// file in the top level directory.
///

#ifndef CPU_SUPPORTS_AVX512_BMI2_HPP
#define CPU_SUPPORTS_AVX512_BMI2_HPP
#ifndef CPU_SUPPORTS_AVX512_VPOPCNT_HPP
#define CPU_SUPPORTS_AVX512_VPOPCNT_HPP

namespace primecount {

bool has_cpuid_avx512_bmi2();
bool has_cpuid_avx512_vpopcnt();

} // namespace

namespace {

/// Initialized at startup
const bool cpu_supports_avx512_bmi2 = primecount::has_cpuid_avx512_bmi2();
const bool cpu_supports_avx512_vpopcnt = primecount::has_cpuid_avx512_vpopcnt();

} // namespace

Expand Down
2 changes: 1 addition & 1 deletion scripts/build_clang_multiarch_win_x64.bat
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@
:: include all MMX, SSE, POPCNT, BMI, BMI2, AVX, AVX and AVX512 headers.

del /Q ..\src\deleglise-rivat\S2_easy.cpp ..\src\gourdon\AC.cpp
clang++ -I../include -I../lib/primesieve/include -O3 -mpopcnt -fopenmp -Wall -Wextra -pedantic -DNDEBUG -DENABLE_INT128_OPENMP_PATCH -DENABLE_MULTIARCH_AVX512_VBMI2 -DENABLE_MULTIARCH_AVX512_BMI2 ../lib/primesieve/src/*.cpp ../lib/primesieve/src/x86/*.cpp ../src/*.cpp ../src/x86/*.cpp ../src/lmo/*.cpp ../src/deleglise-rivat/*.cpp ../src/gourdon/*.cpp ../src/app/*.cpp -o primecount.exe "C:\Program Files\LLVM\lib\clang\18\lib\windows\clang_rt.builtins-x86_64.lib"
clang++ -I../include -I../lib/primesieve/include -O3 -mpopcnt -fopenmp -Wall -Wextra -pedantic -DNDEBUG -DENABLE_INT128_OPENMP_PATCH -DENABLE_MULTIARCH_AVX512_VBMI2 -DENABLE_MULTIARCH_AVX512_VPOPCNT ../lib/primesieve/src/*.cpp ../lib/primesieve/src/x86/*.cpp ../src/*.cpp ../src/x86/*.cpp ../src/lmo/*.cpp ../src/deleglise-rivat/*.cpp ../src/gourdon/*.cpp ../src/app/*.cpp -o primecount.exe "C:\Program Files\LLVM\lib\clang\18\lib\windows\clang_rt.builtins-x86_64.lib"
git checkout ..\src\deleglise-rivat
git checkout ..\src\gourdon
6 changes: 1 addition & 5 deletions src/x86/cpuid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
// https://en.wikipedia.org/wiki/CPUID

// %ebx bit flags
#define bit_BMI2 (1 << 8)
#define bit_AVX512F (1 << 16)

// %ecx bit flags
Expand Down Expand Up @@ -91,7 +90,7 @@ bool has_cpuid_popcnt()
return (abcd[2] & bit_POPCNT) == bit_POPCNT;
}

bool has_cpuid_avx512_bmi2()
bool has_cpuid_avx512_vpopcnt()
{
int abcd[4];

Expand All @@ -117,9 +116,6 @@ bool has_cpuid_avx512_bmi2()

run_cpuid(7, 0, abcd);

if ((abcd[1] & bit_BMI2) != bit_BMI2)
return false;

// AVX512F, AVX512VPOPCNTDQ
return ((abcd[1] & bit_AVX512F) == bit_AVX512F &&
(abcd[2] & bit_AVX512_VPOPCNTDQ) == bit_AVX512_VPOPCNTDQ);
Expand Down

0 comments on commit 4ffaf46

Please sign in to comment.