Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix mem utils and crc64 tidy #3045

Merged
merged 7 commits into from
Sep 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion libs/libcommon/include/common/crc64.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#pragma once
#include <cstdint>
#include <cstddef>
#include <cstdint>
namespace crc64
{
enum class Mode
Expand Down
32 changes: 24 additions & 8 deletions libs/libcommon/include/common/crc64_arch/crc64_aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,14 @@

namespace crc64::_detail
{

using simd_t = uint8x16_t;

class SIMD
{
public:
using Poly64Pair = std::pair<poly64_t, poly64_t>;

SIMD(uint64_t high, uint64_t low) noexcept;
SIMD(uint64_t high, uint64_t low) noexcept

[[nodiscard]] SIMD fold16(SIMD coeff) const noexcept;

Expand Down Expand Up @@ -47,9 +46,15 @@ class SIMD
[[nodiscard]] poly64_t low64() const noexcept;
};

inline SIMD::SIMD(uint64_t high, uint64_t low) noexcept { _inner = vcombine_u8(vcreate_u8(low), vcreate_u8(high)); }
inline SIMD::SIMD(uint64_t high, uint64_t low) noexcept
{
_inner = vcombine_u8(vcreate_u8(low), vcreate_u8(high));
}

inline SIMD SIMD::bitxor(SIMD that) const noexcept { return SIMD{veorq_u8(_inner, that._inner)}; }
inline SIMD SIMD::bitxor(SIMD that) const noexcept
{
return SIMD{veorq_u8(_inner, that._inner)};
}

inline SIMD SIMD::fold8(uint64_t coeff) const noexcept
{
Expand Down Expand Up @@ -89,19 +94,30 @@ inline uint64_t SIMD::barrett(uint64_t poly, uint64_t mu) const noexcept
return reduced ^ static_cast<uint64_t>(t1);
}

inline SIMD::SIMD(simd_t inner) noexcept : _inner(inner) {}
inline SIMD::SIMD(simd_t inner) noexcept
: _inner(inner)
{}

inline SIMD & SIMD::operator^=(const SIMD & that) noexcept
{
this->_inner = this->bitxor(that)._inner;
return *this;
}

inline SIMD SIMD::operator^(const SIMD & that) const noexcept { return bitxor(that); }
inline SIMD SIMD::operator^(const SIMD & that) const noexcept
{
return bitxor(that);
}

inline bool SIMD::operator==(const SIMD & that) const noexcept { return ::memcmp(&_inner, &that._inner, 16) == 0; }
inline bool SIMD::operator==(const SIMD & that) const noexcept
{
return ::memcmp(&_inner, &that._inner, 16) == 0;
}

inline SIMD SIMD::aligned(const void * address) noexcept { return SIMD{vld1q_u8(reinterpret_cast<const uint8_t *>(address))}; }
inline SIMD SIMD::aligned(const void * address) noexcept
{
return SIMD{vld1q_u8(reinterpret_cast<const uint8_t *>(address))};
}

inline SIMD SIMD::from_mul(poly64_t a, poly64_t b) noexcept
{
Expand Down
52 changes: 34 additions & 18 deletions libs/libcommon/include/common/crc64_arch/crc64_x86.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,18 @@
#ifdef __x86_64__
#include <immintrin.h>

#include <cstddef>
#include <cstdint>
namespace crc64::_detail
{

using simd_t = __m128i;


class SIMD
{
public:
SIMD(uint64_t high, uint64_t low) noexcept;
SIMD(uint64_t high, uint64_t low)
noexcept;

[[nodiscard]] SIMD fold16(SIMD coeff) const noexcept;

Expand All @@ -29,59 +31,73 @@ class SIMD

bool operator==(const SIMD & that) const noexcept;

explicit SIMD(simd_t inner) noexcept;
explicit SIMD(simd_t val) noexcept;

private:
simd_t _inner{};
simd_t inner{};
};

inline SIMD::SIMD(uint64_t high, uint64_t low) noexcept { _inner = _mm_set_epi64x(static_cast<int64_t>(high), static_cast<int64_t>(low)); }
inline SIMD::SIMD(uint64_t high, uint64_t low) noexcept
{
inner = _mm_set_epi64x(static_cast<int64_t>(high), static_cast<int64_t>(low));
}

inline SIMD SIMD::bitxor(SIMD that) const noexcept { return SIMD{_mm_xor_si128(_inner, that._inner)}; }
inline SIMD SIMD::bitxor(SIMD that) const noexcept
{
return SIMD{_mm_xor_si128(inner, that.inner)};
}

inline SIMD SIMD::fold8(uint64_t coeff) const noexcept
{
auto tmp = SIMD{0, coeff};
auto h = SIMD{_mm_clmulepi64_si128(_inner, tmp._inner, 0x00)};
auto l = SIMD{_mm_srli_si128(_inner, 8)};
auto h = SIMD{_mm_clmulepi64_si128(inner, tmp.inner, 0x00)};
auto l = SIMD{_mm_srli_si128(inner, 8)};
return h.bitxor(l);
}

inline SIMD SIMD::fold16(SIMD coeff) const noexcept
{
auto h = SIMD{_mm_clmulepi64_si128(_inner, coeff._inner, 0x11)};
auto l = SIMD{_mm_clmulepi64_si128(_inner, coeff._inner, 0x00)};
auto h = SIMD{_mm_clmulepi64_si128(inner, coeff.inner, 0x11)};
auto l = SIMD{_mm_clmulepi64_si128(inner, coeff.inner, 0x00)};
return h.bitxor(l);
}

inline uint64_t SIMD::barrett(uint64_t poly, uint64_t mu) const noexcept
{
auto polymu = SIMD{poly, mu};
auto t1 = _mm_clmulepi64_si128(_inner, polymu._inner, 0x00);
auto t1 = _mm_clmulepi64_si128(inner, polymu.inner, 0x00);
auto h = SIMD{_mm_slli_si128(t1, 8)};
auto l = SIMD{_mm_clmulepi64_si128(t1, polymu._inner, 0x10)};
auto l = SIMD{_mm_clmulepi64_si128(t1, polymu.inner, 0x10)};
auto reduced = h.bitxor(l).bitxor(*this);
return static_cast<uint64_t>(_mm_extract_epi64(reduced._inner, 1));
return static_cast<uint64_t>(_mm_extract_epi64(reduced.inner, 1));
}

inline SIMD::SIMD(simd_t inner) noexcept : _inner(inner) {}
inline SIMD::SIMD(simd_t val) noexcept
: inner(val)
{}

inline SIMD & SIMD::operator^=(const SIMD & that) noexcept
{
this->_inner = this->bitxor(that)._inner;
this->inner = this->bitxor(that).inner;
return *this;
}

inline SIMD SIMD::operator^(const SIMD & that) const noexcept { return bitxor(that); }
inline SIMD SIMD::operator^(const SIMD & that) const noexcept
{
return bitxor(that);
}

inline bool SIMD::operator==(const SIMD & that) const noexcept
{
auto tmp = _mm_cmpeq_epi8(_inner, that._inner);
auto tmp = _mm_cmpeq_epi8(inner, that.inner);
auto mask = _mm_movemask_epi8(tmp);
return mask == 0xFFFF;
}

inline SIMD SIMD::aligned(const void * address) noexcept { return SIMD{_mm_load_si128(reinterpret_cast<const __m128i *>(address))}; }
inline SIMD SIMD::aligned(const void * address) noexcept
{
return SIMD{_mm_load_si128(reinterpret_cast<const __m128i *>(address))};
}

} // namespace crc64::_detail
#endif
1 change: 0 additions & 1 deletion libs/libcommon/include/common/crc64_fast.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
#include <common/crc64_table.h>
namespace crc64::_detail
{

#if defined(TIFLASH_ENABLE_ASIMD_SUPPORT) || __SSE2__
#define TIFLASH_CRC64_HAS_SIMD_SUPPORT
// avx2 and avx512 variants
Expand Down
9 changes: 6 additions & 3 deletions libs/libcommon/include/common/crc64_table.h
Original file line number Diff line number Diff line change
Expand Up @@ -1484,7 +1484,10 @@ static inline const uint64_t TABLE_15[256] = {

// clang-format on

static inline uint64_t update1(uint64_t state, uint8_t x) { return (state >> 8) ^ TABLE_0[x ^ (static_cast<uint8_t>(state))]; }
static inline uint64_t update1(uint64_t state, uint8_t x)
{
return (state >> 8) ^ TABLE_0[x ^ (static_cast<uint8_t>(state))];
}

static inline uint64_t update16(uint64_t state, const std::array<uint8_t, 16> & slice)
{
Expand All @@ -1509,7 +1512,7 @@ static inline uint64_t update16(uint64_t state, const std::array<uint8_t, 16> &
static inline uint64_t update_table(uint64_t state, const void * src, size_t length)
{
auto address = reinterpret_cast<uintptr_t>(src);
auto ptr = reinterpret_cast<const uint8_t *>(src);
const auto * ptr = reinterpret_cast<const uint8_t *>(src);
auto prefix = (-address) & 15;
if (prefix >= length)
{
Expand All @@ -1527,7 +1530,7 @@ static inline uint64_t update_table(uint64_t state, const void * src, size_t len
};
for (size_t i = 0; i < middle; i += 16, ptr += 16)
{
auto aligned = reinterpret_cast<const uint8_t *>(__builtin_assume_aligned(ptr, 16));
const auto * aligned = reinterpret_cast<const uint8_t *>(__builtin_assume_aligned(ptr, 16));
std::array<uint8_t, 16> slice{
aligned[0],
aligned[1],
Expand Down
1 change: 0 additions & 1 deletion libs/libcommon/include/common/mem_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ namespace mem_utils
{
namespace _detail
{

using ConstBytePtr = char const *;

/// @attention one should not use these loop functions directly in the first place,
Expand Down
1 change: 0 additions & 1 deletion libs/libcommon/src/crc64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include <common/simd.h>
namespace crc64
{

Digest::Digest(Mode mode)
{
// clang-format off
Expand Down
7 changes: 5 additions & 2 deletions libs/libcommon/src/crc64_avx2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@ uint64_t update_vpclmulqdq_avx2(uint64_t state, const void * src, size_t length)
x[0] = _mm256_xor_si256(x[0], _mm256_set_epi64x(0, 0, 0, static_cast<int64_t>(state)));

auto coeff = _mm256_set_epi64x(
static_cast<int64_t>(K_1023), static_cast<int64_t>(K_1087), static_cast<int64_t>(K_1023), static_cast<int64_t>(K_1087));
static_cast<int64_t>(K_1023),
static_cast<int64_t>(K_1087),
static_cast<int64_t>(K_1023),
static_cast<int64_t>(K_1087));

auto fold = [](avx256_t a, avx256_t b) -> avx256_t {
auto h = _mm256_clmulepi64_epi128(a, b, 0x11);
Expand Down Expand Up @@ -103,7 +106,7 @@ uint64_t update_vpclmulqdq_avx2(uint64_t state, const void * src, size_t length)
SIMD{K_511, K_575}, // fold by distance of 64 bytes
SIMD{K_383, K_447}, // fold by distance of 48 bytes
SIMD{K_255, K_319}, // fold by distance of 32 bytes
SIMD{K_127, K_191} // fold by distance of 16 bytes
SIMD{K_127, K_191} // fold by distance of 16 bytes
};

auto acc = y[7];
Expand Down
20 changes: 11 additions & 9 deletions libs/libcommon/src/crc64_avx512.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,22 @@ uint64_t update_vpclmulqdq_avx512(uint64_t state, const void * src, size_t lengt

const auto * ptr = reinterpret_cast<const avx512_t *>(__builtin_assume_aligned(src, 512));

auto load_slice = [](const avx512_t * address) -> Slice<2> { return {_mm512_load_si512(address), _mm512_load_si512(address + 1)}; };
auto load_slice = [](const avx512_t * address) -> Slice<2> {
return {_mm512_load_si512(address), _mm512_load_si512(address + 1)};
};

auto x = load_slice(ptr);
ptr += 2;
x[0] = _mm512_xor_si512(x[0], _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, static_cast<int64_t>(state)));

auto coeff = _mm512_set_epi64(static_cast<int64_t>(K_1023),
static_cast<int64_t>(K_1087),
static_cast<int64_t>(K_1023),
static_cast<int64_t>(K_1087),
static_cast<int64_t>(K_1023),
static_cast<int64_t>(K_1087),
static_cast<int64_t>(K_1023),
static_cast<int64_t>(K_1087));
static_cast<int64_t>(K_1087),
static_cast<int64_t>(K_1023),
static_cast<int64_t>(K_1087),
static_cast<int64_t>(K_1023),
static_cast<int64_t>(K_1087),
static_cast<int64_t>(K_1023),
static_cast<int64_t>(K_1087));

auto fold = [](avx512_t a, avx512_t b) -> avx512_t {
auto h = _mm512_clmulepi64_epi128(a, b, 0x11);
Expand Down Expand Up @@ -97,7 +99,7 @@ uint64_t update_vpclmulqdq_avx512(uint64_t state, const void * src, size_t lengt
SIMD{K_511, K_575}, // fold by distance of 64 bytes
SIMD{K_383, K_447}, // fold by distance of 48 bytes
SIMD{K_255, K_319}, // fold by distance of 32 bytes
SIMD{K_127, K_191} // fold by distance of 16 bytes
SIMD{K_127, K_191} // fold by distance of 16 bytes
};

auto acc = y[7];
Expand Down
2 changes: 1 addition & 1 deletion libs/libcommon/src/crc64_sse2_asimd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ uint64_t update_simd(uint64_t state, const void * src, size_t length)
SIMD{K_511, K_575}, // fold by distance of 64 bytes
SIMD{K_383, K_447}, // fold by distance of 48 bytes
SIMD{K_255, K_319}, // fold by distance of 32 bytes
SIMD{K_127, K_191} // fold by distance of 16 bytes
SIMD{K_127, K_191} // fold by distance of 16 bytes
};

auto acc = x[7];
Expand Down
Loading