From 56f9ad8c265f496ac5ddfa2368155d372f684595 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sun, 27 Oct 2024 09:40:28 -0700
Subject: [PATCH] More efficient final reduction for CRC32

---
 lib/arm/crc32_impl.h            |  37 ++++------
 lib/crc32_multipliers.h         |   4 +-
 lib/x86/crc32_pclmul_template.h | 117 +++++++++++++++++---------------
 scripts/gen_crc32_multipliers.c |  40 +++++------
 4 files changed, 97 insertions(+), 101 deletions(-)

diff --git a/lib/arm/crc32_impl.h b/lib/arm/crc32_impl.h
index e33ba680..1e9f0f25 100644
--- a/lib/arm/crc32_impl.h
+++ b/lib/arm/crc32_impl.h
@@ -434,13 +434,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
 		{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
 		{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
 	};
-	static const u64 _aligned_attribute(16) final_mults[3][2] = {
-		{ CRC32_X63_MODG, 0 },
-		{ CRC32_BARRETT_CONSTANT_1, 0 },
-		{ CRC32_BARRETT_CONSTANT_2, 0 },
+	static const u64 _aligned_attribute(16) barrett_consts[2][2] = {
+		{ CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_1 },
+		{ CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_2 },
+	};
+	static const u32 _aligned_attribute(16) mask32[4] = {
+		0, 0, 0xffffffff, 0
 	};
-	const uint8x16_t zeroes = vdupq_n_u8(0);
-	const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF));
 	const poly64x2_t multipliers_1 = load_multipliers(mults[0]);
 	uint8x16_t v0, v1, v2, v3;
 
@@ -497,24 +497,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
 	if (len)
 		v0 = fold_partial_vec(v0, p, len, multipliers_1);
 
-	/*
-	 * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
-	 * which is equivalent to multiplying by x^32. This is needed because
-	 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
-	 */
-
-	v0 = veorq_u8(vextq_u8(v0, zeroes, 8),
-		      clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1));
-
-	/* Fold 96 => 64 bits. */
-	v0 = veorq_u8(vextq_u8(v0, zeroes, 4),
-		      clmul_low(vandq_u8(v0, mask32),
-				load_multipliers(final_mults[0])));
-
-	/* Reduce 64 => 32 bits using Barrett reduction. */
-	v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1]));
-	v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2]));
-	return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1);
+	/* Reduce to 32 bits, following lib/x86/crc32_pclmul_template.h */
+	v1 = clmul_low(v0, load_multipliers(barrett_consts[0]));
+	v1 = clmul_low(v1, load_multipliers(barrett_consts[1]));
+	v0 = veorq_u8(v0, vandq_u8(v1, vreinterpretq_u8_u32(vld1q_u32(mask32))));
+	v0 = clmul_high(v0, load_multipliers(barrett_consts[0]));
+	v0 = clmul_low(v0, load_multipliers(barrett_consts[1]));
+	return vgetq_lane_u32(vreinterpretq_u32_u8(v0), 2);
 }
 #undef SUFFIX
 #undef ATTRIBUTES
diff --git a/lib/crc32_multipliers.h b/lib/crc32_multipliers.h
index 65a9bf3e..4fdb7bf3 100644
--- a/lib/crc32_multipliers.h
+++ b/lib/crc32_multipliers.h
@@ -100,10 +100,8 @@
 #define CRC32_X4127_MODG 0x1072db28 /* x^4127 mod G(x) */
 #define CRC32_X4063_MODG 0x0c30f51d /* x^4063 mod G(x) */
 
-#define CRC32_X63_MODG 0xb8bc6765 /* x^63 mod G(x) */
-#define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */
+#define CRC32_BARRETT_CONSTANT_1 0xb4e5b025f7011641ULL /* floor(x^95 / G(x)) */
 #define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */
-#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }
 
 #define CRC32_NUM_CHUNKS 4
 #define CRC32_MIN_VARIABLE_CHUNK_LEN 128UL
diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h
index f8e0b2a7..f818a9b2 100644
--- a/lib/x86/crc32_pclmul_template.h
+++ b/lib/x86/crc32_pclmul_template.h
@@ -51,7 +51,10 @@
  * instructions. Note that the x86 crc32 instruction cannot be used, as it is
  * for a different polynomial, not the gzip one. For an explanation of CRC
  * folding with carryless multiplication instructions, see
- * scripts/gen_crc32_multipliers.c and the following paper:
+ * scripts/gen_crc32_multipliers.c and the following blog posts and papers:
+ *
+ * "An alternative exposition of crc32_4k_pclmulqdq"
+ * https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq
  *
  * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
@@ -193,10 +196,9 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 	const vec_t mults_2v = MULTS_2V;
 	const vec_t mults_1v = MULTS_1V;
 	const __m128i mults_128b = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG);
-	const __m128i final_mult = _mm_set_epi64x(0, CRC32_X63_MODG);
-	const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
 	const __m128i barrett_reduction_constants =
 		_mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1);
+	const __m128i mask32 = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0);
 	vec_t v0, v1, v2, v3, v4, v5, v6, v7;
 	__m128i x0 = _mm_cvtsi32_si128(crc);
 	__m128i x1;
@@ -389,68 +391,73 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 #if USE_AVX512
 reduce_x0:
 #endif
-
-	/*
-	 * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
-	 * which is equivalent to multiplying by x^32. This is needed because
-	 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
-	 */
-	x0 = _mm_xor_si128(_mm_srli_si128(x0, 8),
-			   _mm_clmulepi64_si128(x0, mults_128b, 0x10));
-
-	/* Fold 96 => 64 bits. */
-	x0 = _mm_xor_si128(_mm_srli_si128(x0, 4),
-			   _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
-						final_mult, 0x00));
-
 	/*
-	 * Reduce 64 => 32 bits using Barrett reduction.
-	 *
-	 * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to
-	 * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)):
-	 *
-	 *	R(x) = (A(x)*x^32 + B(x)) mod G(x)
-	 *	     = (A(x)*x^32) mod G(x) + B(x)
-	 *
-	 * Then, by the Division Algorithm there exists a unique q(x) such that:
-	 *
-	 *	A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
-	 *
-	 * Since the left-hand side is of maximum degree 31, the right-hand side
-	 * must be too. This implies that we can apply 'mod x^32' to the
-	 * right-hand side without changing its value:
+	 * Generate the final CRC32 from the 128-bit value A = x0 as follows:
 	 *
-	 *	(A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
+	 *	crc = x^32 * A mod G
+	 *	    = x^32 * (x^64*A_L + A_H) mod G
+	 *	    = x^32 * ((x^32*(x^32*A_L mod G)) + A_H) mod G
 	 *
-	 * Note that '+' is equivalent to '-' in polynomials over GF(2).
+	 * I.e.:
+	 *	crc := 0
+	 *	crc := x^32 * (x^32*crc + A_L) mod G
+	 *	crc := x^32 * (x^32*crc + A_H) mod G
 	 *
-	 * We also know that:
-	 *
-	 *	              / A(x)*x^32 \
-	 *	q(x) = floor (  ---------  )
-	 *	              \    G(x)   /
+	 * A_L and A_H refer to the physically low and high qwords of A, which
+	 * contain the high and low polynomial coefficients respectively!
+	 */
+
+	/*
+	 * Using Barrett reduction, the first step is:
 	 *
-	 * To compute this efficiently, we can multiply the top and bottom by
-	 * x^32 and move the division by G(x) to the top:
+	 *	crc := x^32*A_L mod G
+	 *	     = ((A_L * (x^(32+n) // G)) // x^n) * G mod x^32   [for n >= 63]
+	 *	     = ((A_L * (x^95 // G)) // x^63) * G mod x^32
 	 *
-	 *	              / A(x) * floor(x^64 / G(x)) \
-	 *	q(x) = floor (  -------------------------  )
-	 *	              \           x^32            /
+	 * The // symbol denotes floored division, producing the unique quotient
+	 * polynomial for the given dividend and divisor. Two carryless
+	 * multiplications, denoted by *, are needed. The first is of A_L,
+	 * which is of max degree 63, by the degree-63 constant x^95 // G. This
+	 * produces a product of max degree 126. Considering CRC32's reverse
+	 * coefficient order, the low qword of the product contains terms x^63
+	 * through x^126, which become x^0 through x^63 after the division by
+	 * x^63 which happens for free. Multiplying this value of max degree 63
+	 * by the degree-32 constant G gives a product of max degree 95, equal
+	 * to x^32*A_L + (x^32*A_L mod G). The desired mod G part is in the
+	 * low-order 32 polynomial coefficients, i.e. the index 2 dword.
 	 *
-	 * Note that floor(x^64 / G(x)) is a constant.
+	 * Note that when choosing n in (A_L * (x^(32+n) // G)) // x^n, only
+	 * n=63 works. n < 63 would be less than the maximum degree of A_L,
+	 * which would cause insufficient precision to be carried through the
+	 * calculation. n > 63 would require a carryless multiplication
+	 * instruction that takes an input larger than 64 bits.
+	 */
+	x1 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x00);
+	x1 = _mm_clmulepi64_si128(x1, barrett_reduction_constants, 0x10);
+
+	/*
+	 * The second step, also using Barrett reduction:
 	 *
-	 * So finally we have:
+	 *	crc := x^32 * (x^32*crc + A_H) mod G
+	 *	     = floor(((x^32*crc + A_H) * floor(x^95 / G)) / x^63) * G mod x^32
 	 *
-	 *	                          / A(x) * floor(x^64 / G(x)) \
-	 *	R(x) = B(x) + G(x)*floor (  -------------------------  )
-	 *	                          \           x^32            /
+	 * Same as previous but uses the high qword of A and starts with adding
+	 * x^32*crc to it. Considering CRC32's reverse-coefficient order, that
+	 * means xor'ing crc into the index 2 dword of A, which is conveniently
+	 * the same dword index that crc is in. Select the crc dword by and-ing
+	 * with a mask, then do the xor; that is a ternary boolean operation
+	 * a ^ (b & c) which with AVX512 is a vpternlog with immediate 0x78.
 	 */
-	x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
-				  barrett_reduction_constants, 0x00);
-	x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32),
-				  barrett_reduction_constants, 0x10);
-	x0 = _mm_xor_si128(x0, x1);
-	return _mm_extract_epi32(x0, 1);
+#if USE_AVX512
+	x0 = _mm_ternarylogic_epi32(x0, x1, mask32, 0x78);
+#else
+	x0 = _mm_xor_si128(x0, _mm_and_si128(x1, mask32));
+#endif
+	x0 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x01);
+	x0 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x10);
+	/* The resulting crc is in the index 2 dword. */
+
+	return _mm_extract_epi32(x0, 2);
 }
 
 #undef vec_t
diff --git a/scripts/gen_crc32_multipliers.c b/scripts/gen_crc32_multipliers.c
index 1073bfc8..4ec7bcfb 100644
--- a/scripts/gen_crc32_multipliers.c
+++ b/scripts/gen_crc32_multipliers.c
@@ -74,17 +74,28 @@ compute_xD_modG(size_t D)
 	return remainder;
 }
 
-/* Compute floor(x^64 / G(x)) */
+/* Compute floor(x^95 / G(x)) */
 static u64
-compute_x64_div_G(void)
+compute_x95_div_G(void)
 {
+	/* The quotient, max order 95 - 32 = 63. */
 	u64 quotient = 0;
-	u64 dividend = 0x1;
 
-	for (int i = 0; i < 64 - 32 + 1; i++) {
-		if ((dividend >> i) & 1) {
-			quotient |= (u64)1 << i;
-			dividend ^= CRCPOLY_FULL << i;
+	/*
+	 * The x^32 through x^95 terms of the remainder. This starts at x^95
+	 * and is updated through long division. At the end only the
+	 * x^0 through x^31 terms will be nonzero, but those are unneeded.
+	 */
+	u64 remainder = 0x1;
+
+	for (int i = 0; i < 64; i++) {
+		/*
+		 * If the x^(95-i) term of remainder is nonzero, add
+		 * x^(63-i) * G(x) to cancel it out. (G(x) has order 32.)
+		 */
+		if (remainder & (1ULL << i)) {
+			quotient |= 1ULL << i;
+			remainder ^= (u64)CRCPOLY_FULL << i;
 		}
 	}
 
@@ -123,20 +134,11 @@ gen_vec_folding_constants(void)
 		printf("\n");
 	}
 
-	/* Multiplier for final 96 => 64 bit fold */
-	printf("#define CRC32_X63_MODG 0x%08"PRIx32" /* x^63 mod G(x) */\n",
-	       compute_xD_modG(63));
-
-	/*
-	 * Constants for final 64 => 32 bit reduction. These constants are the
-	 * odd ones out, as this final reduction step can't use the regular CRC
-	 * folding described above. It uses Barrett reduction instead.
-	 */
-	printf("#define CRC32_BARRETT_CONSTANT_1 0x%016"PRIx64"ULL /* floor(x^64 / G(x)) */\n",
-	       compute_x64_div_G());
+	/* Constants for the final 128 => 32 bit reduction */
+	printf("#define CRC32_BARRETT_CONSTANT_1 0x%016"PRIx64"ULL /* floor(x^95 / G(x)) */\n",
+	       compute_x95_div_G());
 	printf("#define CRC32_BARRETT_CONSTANT_2 0x%016"PRIx64"ULL /* G(x) */\n",
 	       CRCPOLY_FULL);
-	printf("#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }\n");
 }
 
 /* Multipliers for combining the CRCs of separate chunks */
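
Appendix (reviewer note, not part of the patch): the standalone C sketch below re-derives the two-step Barrett reduction that the new comments describe, but in the natural (non-reflected) bit order, so plain shifts stand in for the qword/dword selections that the pclmulqdq/pmull code does. It computes floor(x^95 / G) by long division, applies the two Barrett steps to a 128-bit value, and checks the result against a bit-at-a-time reference. The helper names (clmul64, barrett_step, ref_reduce), the use of the unsigned __int128 extension, and the test constants are illustrative assumptions, not libdeflate code.

/*
 * barrett_demo.c - sketch of the 128 => 32 bit Barrett reduction, in the
 * natural (non-reflected) bit order. Assumes GCC/Clang for unsigned __int128.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 u128;

#define G_FULL 0x104C11DB7ULL	/* G(x), degree 32, natural bit order */

/* 64 x 64 => 128 bit carryless multiplication; bit i = coefficient of x^i */
static u128
clmul64(uint64_t a, uint64_t b)
{
	u128 product = 0;

	for (int i = 0; i < 64; i++) {
		if ((b >> i) & 1)
			product ^= (u128)a << i;
	}
	return product;
}

/* floor(x^95 / G(x)) by long division; the quotient has degree 63 */
static uint64_t
x95_div_G(void)
{
	u128 remainder = (u128)1 << 95;
	uint64_t quotient = 0;

	for (int i = 63; i >= 0; i--) {
		if ((remainder >> (i + 32)) & 1) {	/* cancel the x^(32+i) term */
			quotient |= 1ULL << i;
			remainder ^= (u128)G_FULL << i;
		}
	}
	return quotient;
}

/* One Barrett step: x^32 * A mod G for deg(A) <= 63, two carryless multiplications */
static uint32_t
barrett_step(uint64_t A, uint64_t x95_over_G)
{
	/* T = floor(A * floor(x^95 / G) / x^63), a polynomial of degree <= 63 */
	uint64_t T = (uint64_t)(clmul64(A, x95_over_G) >> 63);

	/* x^32 * A mod G is the low 32 coefficients of T * G */
	return (uint32_t)clmul64(T, G_FULL);
}

/* Reference: (P * x^32) mod G for the 128-bit polynomial P = hi:lo, bit at a time */
static uint32_t
ref_reduce(uint64_t hi, uint64_t lo)
{
	uint32_t rem = 0;

	for (int i = 159; i >= 0; i--) {
		int j = i - 32;	/* P's coefficient index; j < 0 shifts in the appended zeros */
		uint32_t bit = (j < 0) ? 0 :
			(uint32_t)((j >= 64 ? hi >> (j - 64) : lo >> j) & 1);
		uint32_t msb = rem >> 31;

		rem = (rem << 1) | bit;
		if (msb)
			rem ^= (uint32_t)G_FULL;	/* subtract G whenever x^32 appears */
	}
	return rem;
}

int
main(void)
{
	const uint64_t q = x95_div_G();
	uint64_t hi = 0x0123456789abcdefULL, lo = 0xfedcba9876543210ULL;

	/* Reversing this value's 64 bits should give CRC32_BARRETT_CONSTANT_1. */
	printf("floor(x^95 / G) = 0x%016" PRIx64 " (natural bit order)\n", q);

	for (int trial = 0; trial < 10000; trial++) {
		uint32_t crc;

		/* step 1 on the high 64 coefficients, step 2 folds in the low 64 */
		crc = barrett_step(hi, q);
		crc = barrett_step(((uint64_t)crc << 32) ^ lo, q);
		if (crc != ref_reduce(hi, lo)) {
			printf("mismatch at trial %d\n", trial);
			return 1;
		}
		/* xorshift-style scramble to generate fresh test inputs */
		hi ^= hi << 13; hi ^= hi >> 7; hi ^= hi << 17;
		lo ^= lo << 13; lo ^= lo >> 7; lo ^= lo << 17;
	}
	printf("two-step Barrett reduction matches the bitwise reference\n");
	return 0;
}

Because this sketch works in natural coefficient order, its first step consumes the high 64 coefficients; in the patch's reflected representation that role is played by A_L, the physically low qword. The two clmuls per step mirror the new _mm_clmulepi64_si128 / pmull sequences, which select operands via the immediate byte and the mask32 constant instead of shifts.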