Skip to content

Commit

Permalink
More efficient final reduction for CRC32
Browse files Browse the repository at this point in the history
  • Loading branch information
ebiggers committed Oct 27, 2024
1 parent d5070bb commit 68d114b
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 102 deletions.
37 changes: 13 additions & 24 deletions lib/arm/crc32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -434,13 +434,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
};
static const u64 _aligned_attribute(16) final_mults[3][2] = {
{ CRC32_X63_MODG, 0 },
{ CRC32_BARRETT_CONSTANT_1, 0 },
{ CRC32_BARRETT_CONSTANT_2, 0 },
static const u64 _aligned_attribute(16) barrett_consts[2][2] = {
{ CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_1 },
{ CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_2 },
};
static const u32 _aligned_attribute(16) mask32[4] = {
0, 0, 0xffffffff, 0
};
const uint8x16_t zeroes = vdupq_n_u8(0);
const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF));
const poly64x2_t multipliers_1 = load_multipliers(mults[0]);
uint8x16_t v0, v1, v2, v3;

Expand Down Expand Up @@ -497,24 +497,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
if (len)
v0 = fold_partial_vec(v0, p, len, multipliers_1);

/*
* Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
* which is equivalent to multiplying by x^32. This is needed because
* the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
*/

v0 = veorq_u8(vextq_u8(v0, zeroes, 8),
clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1));

/* Fold 96 => 64 bits. */
v0 = veorq_u8(vextq_u8(v0, zeroes, 4),
clmul_low(vandq_u8(v0, mask32),
load_multipliers(final_mults[0])));

/* Reduce 64 => 32 bits using Barrett reduction. */
v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1]));
v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2]));
return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1);
/* Reduce to 32 bits, following lib/x86/crc32_pclmul_template.h */
v1 = clmul_low(v0, load_multipliers(barrett_consts[0]));
v1 = clmul_low(v1, load_multipliers(barrett_consts[1]));
v0 = veorq_u8(v0, vandq_u8(v1, vreinterpretq_u8_u32(vld1q_u32(mask32))));
v0 = clmul_high(v0, load_multipliers(barrett_consts[0]));
v0 = clmul_low(v0, load_multipliers(barrett_consts[1]));
return vgetq_lane_u32(vreinterpretq_u32_u8(v0), 2);
}
#undef SUFFIX
#undef ATTRIBUTES
Expand Down
4 changes: 1 addition & 3 deletions lib/crc32_multipliers.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,8 @@
#define CRC32_X4127_MODG 0x1072db28 /* x^4127 mod G(x) */
#define CRC32_X4063_MODG 0x0c30f51d /* x^4063 mod G(x) */

#define CRC32_X63_MODG 0xb8bc6765 /* x^63 mod G(x) */
#define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */
#define CRC32_BARRETT_CONSTANT_1 0xb4e5b025f7011641ULL /* floor(x^95 / G(x)) */
#define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */
#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }

#define CRC32_NUM_CHUNKS 4
#define CRC32_MIN_VARIABLE_CHUNK_LEN 128UL
Expand Down
109 changes: 53 additions & 56 deletions lib/x86/crc32_pclmul_template.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,10 @@
* instructions. Note that the x86 crc32 instruction cannot be used, as it is
* for a different polynomial, not the gzip one. For an explanation of CRC
* folding with carryless multiplication instructions, see
* scripts/gen_crc32_multipliers.c and the following paper:
* scripts/gen_crc32_multipliers.c and the following blog posts and papers:
*
* "An alternative exposition of crc32_4k_pclmulqdq"
* https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq
*
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
* https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
Expand Down Expand Up @@ -193,10 +196,9 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
const vec_t mults_2v = MULTS_2V;
const vec_t mults_1v = MULTS_1V;
const __m128i mults_128b = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG);
const __m128i final_mult = _mm_set_epi64x(0, CRC32_X63_MODG);
const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
const __m128i barrett_reduction_constants =
_mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1);
const __m128i mask32 = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0);
vec_t v0, v1, v2, v3, v4, v5, v6, v7;
__m128i x0 = _mm_cvtsi32_si128(crc);
__m128i x1;
Expand Down Expand Up @@ -389,68 +391,63 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
#if USE_AVX512
reduce_x0:
#endif

/*
* Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
* which is equivalent to multiplying by x^32. This is needed because
* the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
*/
x0 = _mm_xor_si128(_mm_srli_si128(x0, 8),
_mm_clmulepi64_si128(x0, mults_128b, 0x10));

/* Fold 96 => 64 bits. */
x0 = _mm_xor_si128(_mm_srli_si128(x0, 4),
_mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
final_mult, 0x00));

/*
* Reduce 64 => 32 bits using Barrett reduction.
*
* Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to
* compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)):
*
* R(x) = (A(x)*x^32 + B(x)) mod G(x)
* = (A(x)*x^32) mod G(x) + B(x)
*
* Then, by the Division Algorithm there exists a unique q(x) such that:
*
* A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
*
* Since the left-hand side is of maximum degree 31, the right-hand side
* must be too. This implies that we can apply 'mod x^32' to the
* right-hand side without changing its value:
*
* (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
* Generate the final CRC32 from the 128-bit value A = x0 as follows:
*
* Note that '+' is equivalent to '-' in polynomials over GF(2).
* crc = x^32 * A mod G
* = x^32 * (x^64*A_L + A_H) mod G
* = x^32 * ((x^32*(x^32*A_L mod G)) + A_H) mod G
*
* We also know that:
* I.e.:
* crc := 0
* crc := x^32 * (x^32*crc + A_L) mod G
* crc := x^32 * (x^32*crc + A_H) mod G
*
* / A(x)*x^32 \
* q(x) = floor ( --------- )
* \ G(x) /
*
* To compute this efficiently, we can multiply the top and bottom by
* x^32 and move the division by G(x) to the top:
* Note: A_L and A_H refer to the physically low and high qwords of A,
* which contain the high and low polynomial coefficients respectively!
*/

/*
* Using Barrett reduction, the first step is:
*
* / A(x) * floor(x^64 / G(x)) \
* q(x) = floor ( ------------------------- )
* \ x^32 /
* crc := x^32*A_L mod G
* = floor((A_L * floor(x^95 / G)) / x^63) * G mod x^32
*
* Note that floor(x^64 / G(x)) is a constant.
* Two carryless multiplications are needed. The first is of the 64-bit
* value A_L by the 64-bit constant floor(x^95 / G). This produces a
* 127-bit product. Considering CRC32's reverse-coefficient order, the
* low qword of the product contains terms x^63 through x^126, which
* turn into x^0 through x^63 after the division by x^63 which happens
* for free. Multiplying this 64-bit value by the 33-bit constant G
* gives a 96-bit product. The mod x^32 means take the low-order 32
* polynomial coefficients, i.e. the result is in the index 2 dword.
*/
x1 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x00);
x1 = _mm_clmulepi64_si128(x1, barrett_reduction_constants, 0x10);

/*
* The second step, also using Barrett reduction:
*
* So finally we have:
* crc := x^32 * (x^32*crc + A_H) mod G
* = floor(((x^32*crc + A_H) * floor(x^95 / G)) / x^63) * G mod x^32
*
* / A(x) * floor(x^64 / G(x)) \
* R(x) = B(x) + G(x)*floor ( ------------------------- )
* \ x^32 /
* Same as previous but uses the high qword of A and starts with adding
* x^32*crc to it. Considering CRC32's reverse-coefficient order, that
* means xor'ing crc into the index 2 dword of A, which is conveniently
* the same dword index that crc is in. Select the crc dword by and-ing
* with a mask, then do the xor; that is a ternary boolean operation
* a ^ (b & c) which with AVX512 is a vpternlog with immediate 0x78.
*/
x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
barrett_reduction_constants, 0x00);
x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32),
barrett_reduction_constants, 0x10);
x0 = _mm_xor_si128(x0, x1);
return _mm_extract_epi32(x0, 1);
#if USE_AVX512
x0 = _mm_ternarylogic_epi32(x0, x1, mask32, 0x78);
#else
x0 = _mm_xor_si128(x0, _mm_and_si128(x1, mask32));
#endif
x0 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x01);
x0 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x10);
/* The resulting crc is in the index 2 dword. */

return _mm_extract_epi32(x0, 2);
}

#undef vec_t
Expand Down
40 changes: 21 additions & 19 deletions scripts/gen_crc32_multipliers.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,17 +74,28 @@ compute_xD_modG(size_t D)
return remainder;
}

/* Compute floor(x^64 / G(x)) */
/* Compute floor(x^95 / G(x)) */
static u64
compute_x64_div_G(void)
compute_x95_div_G(void)
{
/* The quotient, max order 95 - 32 = 63. */
u64 quotient = 0;
u64 dividend = 0x1;

for (int i = 0; i < 64 - 32 + 1; i++) {
if ((dividend >> i) & 1) {
quotient |= (u64)1 << i;
dividend ^= CRCPOLY_FULL << i;
/*
* The x^32 through x^95 terms of the remainder. This starts at x^95
* and is updated through long division. At the end only the
* x^0 through x^31 terms will be nonzero, but those are unneeded.
*/
u64 remainder = 0x1;

for (int i = 0; i < 64; i++) {
/*
* If the x^(95-i) term of remainder is nonzero, add
* x^(63-i) * G(x) to cancel it out. (G(x) has order 32.)
*/
if (remainder & (1ULL << i)) {
quotient |= 1ULL << i;
remainder ^= (u64)CRCPOLY_FULL << i;
}
}

Expand Down Expand Up @@ -123,20 +134,11 @@ gen_vec_folding_constants(void)
printf("\n");
}

/* Multiplier for final 96 => 64 bit fold */
printf("#define CRC32_X63_MODG 0x%08"PRIx32" /* x^63 mod G(x) */\n",
compute_xD_modG(63));

/*
* Constants for final 64 => 32 bit reduction. These constants are the
* odd ones out, as this final reduction step can't use the regular CRC
* folding described above. It uses Barrett reduction instead.
*/
printf("#define CRC32_BARRETT_CONSTANT_1 0x%016"PRIx64"ULL /* floor(x^64 / G(x)) */\n",
compute_x64_div_G());
/* Constants for the final 128 => 32 bit reduction */
printf("#define CRC32_BARRETT_CONSTANT_1 0x%016"PRIx64"ULL /* floor(x^95 / G(x)) */\n",
compute_x95_div_G());
printf("#define CRC32_BARRETT_CONSTANT_2 0x%016"PRIx64"ULL /* G(x) */\n",
CRCPOLY_FULL);
printf("#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }\n");
}

/* Multipliers for combining the CRCs of separate chunks */
Expand Down

0 comments on commit 68d114b

Please sign in to comment.