From 56f9ad8c265f496ac5ddfa2368155d372f684595 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sun, 27 Oct 2024 09:40:28 -0700
Subject: [PATCH] More efficient final reduction for CRC32

---
 lib/arm/crc32_impl.h            |  37 ++++------
 lib/crc32_multipliers.h         |   4 +-
 lib/x86/crc32_pclmul_template.h | 117 +++++++++++++++++---------------
 scripts/gen_crc32_multipliers.c |  40 +++++------
 4 files changed, 97 insertions(+), 101 deletions(-)

diff --git a/lib/arm/crc32_impl.h b/lib/arm/crc32_impl.h
index e33ba680..1e9f0f25 100644
--- a/lib/arm/crc32_impl.h
+++ b/lib/arm/crc32_impl.h
@@ -434,13 +434,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
 		{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
 		{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
 	};
-	static const u64 _aligned_attribute(16) final_mults[3][2] = {
-		{ CRC32_X63_MODG, 0 },
-		{ CRC32_BARRETT_CONSTANT_1, 0 },
-		{ CRC32_BARRETT_CONSTANT_2, 0 },
+	static const u64 _aligned_attribute(16) barrett_consts[2][2] = {
+		{ CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_1 },
+		{ CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_2 },
+	};
+	static const u32 _aligned_attribute(16) mask32[4] = {
+		0, 0, 0xffffffff, 0
 	};
-	const uint8x16_t zeroes = vdupq_n_u8(0);
-	const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF));
 	const poly64x2_t multipliers_1 = load_multipliers(mults[0]);
 	uint8x16_t v0, v1, v2, v3;
 
@@ -497,24 +497,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
 	if (len)
 		v0 = fold_partial_vec(v0, p, len, multipliers_1);
 
-	/*
-	 * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
-	 * which is equivalent to multiplying by x^32. This is needed because
-	 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
-	 */
-
-	v0 = veorq_u8(vextq_u8(v0, zeroes, 8),
-		      clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1));
-
-	/* Fold 96 => 64 bits. */
-	v0 = veorq_u8(vextq_u8(v0, zeroes, 4),
-		      clmul_low(vandq_u8(v0, mask32),
-				load_multipliers(final_mults[0])));
-
-	/* Reduce 64 => 32 bits using Barrett reduction. */
-	v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1]));
-	v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2]));
-	return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1);
+	/* Reduce to 32 bits, following lib/x86/crc32_pclmul_template.h */
+	v1 = clmul_low(v0, load_multipliers(barrett_consts[0]));
+	v1 = clmul_low(v1, load_multipliers(barrett_consts[1]));
+	v0 = veorq_u8(v0, vandq_u8(v1, vreinterpretq_u8_u32(vld1q_u32(mask32))));
+	v0 = clmul_high(v0, load_multipliers(barrett_consts[0]));
+	v0 = clmul_low(v0, load_multipliers(barrett_consts[1]));
+	return vgetq_lane_u32(vreinterpretq_u32_u8(v0), 2);
 }
 #undef SUFFIX
 #undef ATTRIBUTES
diff --git a/lib/crc32_multipliers.h b/lib/crc32_multipliers.h
index 65a9bf3e..4fdb7bf3 100644
--- a/lib/crc32_multipliers.h
+++ b/lib/crc32_multipliers.h
@@ -100,10 +100,8 @@
 #define CRC32_X4127_MODG 0x1072db28 /* x^4127 mod G(x) */
 #define CRC32_X4063_MODG 0x0c30f51d /* x^4063 mod G(x) */
 
-#define CRC32_X63_MODG 0xb8bc6765 /* x^63 mod G(x) */
-#define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */
+#define CRC32_BARRETT_CONSTANT_1 0xb4e5b025f7011641ULL /* floor(x^95 / G(x)) */
 #define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */
-#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }
 
 #define CRC32_NUM_CHUNKS 4
 #define CRC32_MIN_VARIABLE_CHUNK_LEN 128UL
diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h
index f8e0b2a7..f818a9b2 100644
--- a/lib/x86/crc32_pclmul_template.h
+++ b/lib/x86/crc32_pclmul_template.h
@@ -51,7 +51,10 @@
  * instructions. Note that the x86 crc32 instruction cannot be used, as it is
  * for a different polynomial, not the gzip one. For an explanation of CRC
  * folding with carryless multiplication instructions, see
- * scripts/gen_crc32_multipliers.c and the following paper:
+ * scripts/gen_crc32_multipliers.c and the following blog posts and papers:
+ *
+ * "An alternative exposition of crc32_4k_pclmulqdq"
+ * https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq
  *
  * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
@@ -193,10 +196,9 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 	const vec_t mults_2v = MULTS_2V;
 	const vec_t mults_1v = MULTS_1V;
 	const __m128i mults_128b = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG);
-	const __m128i final_mult = _mm_set_epi64x(0, CRC32_X63_MODG);
-	const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
 	const __m128i barrett_reduction_constants =
 		_mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1);
+	const __m128i mask32 = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0);
 	vec_t v0, v1, v2, v3, v4, v5, v6, v7;
 	__m128i x0 = _mm_cvtsi32_si128(crc);
 	__m128i x1;
@@ -389,68 +391,73 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 #if USE_AVX512
 reduce_x0:
 #endif
-
-	/*
-	 * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
-	 * which is equivalent to multiplying by x^32. This is needed because
-	 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
-	 */
-	x0 = _mm_xor_si128(_mm_srli_si128(x0, 8),
-			   _mm_clmulepi64_si128(x0, mults_128b, 0x10));
-
-	/* Fold 96 => 64 bits. */
-	x0 = _mm_xor_si128(_mm_srli_si128(x0, 4),
-			   _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
-						final_mult, 0x00));
-
 	/*
-	 * Reduce 64 => 32 bits using Barrett reduction.
-	 *
-	 * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to
-	 * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)):
-	 *
-	 *	R(x) = (A(x)*x^32 + B(x)) mod G(x)
-	 *	     = (A(x)*x^32) mod G(x) + B(x)
-	 *
-	 * Then, by the Division Algorithm there exists a unique q(x) such that:
-	 *
-	 *	A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
-	 *
-	 * Since the left-hand side is of maximum degree 31, the right-hand side
-	 * must be too. This implies that we can apply 'mod x^32' to the
-	 * right-hand side without changing its value:
+	 * Generate the final CRC32 from the 128-bit value A = x0 as follows:
 	 *
-	 *	(A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
+	 *	crc = x^32 * A mod G
+	 *	    = x^32 * (x^64*A_L + A_H) mod G
+	 *	    = x^32 * ((x^32*(x^32*A_L mod G)) + A_H) mod G
 	 *
-	 * Note that '+' is equivalent to '-' in polynomials over GF(2).
+	 * I.e.:
+	 *	crc := 0
+	 *	crc := x^32 * (x^32*crc + A_L) mod G
+	 *	crc := x^32 * (x^32*crc + A_H) mod G
 	 *
-	 * We also know that:
-	 *
-	 *	              / A(x)*x^32 \
-	 *	q(x) = floor (  ---------  )
-	 *	              \    G(x)   /
+	 * A_L and A_H refer to the physically low and high qwords of A, which
+	 * contain the high and low polynomial coefficients respectively!
+	 */
+
+	/*
+	 * Using Barrett reduction, the first step is:
 	 *
-	 * To compute this efficiently, we can multiply the top and bottom by
-	 * x^32 and move the division by G(x) to the top:
+	 *	crc := x^32*A_L mod G
+	 *	     = ((A_L * (x^(32+n) // G)) // x^n) * G mod x^32   [for n >= 63]
+	 *	     = ((A_L * (x^95 // G)) // x^63) * G mod x^32
 	 *
-	 *	              / A(x) * floor(x^64 / G(x)) \
-	 *	q(x) = floor (  -------------------------  )
-	 *	              \           x^32            /
+	 * The // symbol denotes floored division, producing the unique quotient
+	 * polynomial for the given dividend and divisor. Two carryless
+	 * multiplications, denoted by *, are needed. The first is of A_L,
+	 * which is of max degree 63, by the degree-63 constant x^95 // G. This
+	 * produces a product of max degree 126. Considering CRC32's reverse
+	 * coefficient order, the low qword of the product contains terms x^63
+	 * through x^126, which become x^0 through x^63 after the division by
+	 * x^63 which happens for free. Multiplying this value of max degree 63
+	 * by the degree-32 constant G gives a product of max degree 95, equal
+	 * to x^32*A_L + (x^32*A_L mod G). The desired mod G part is in the
+	 * low-order 32 polynomial coefficients, i.e. the index 2 dword.
 	 *
-	 * Note that floor(x^64 / G(x)) is a constant.
+	 * Note that when choosing n in (A_L * (x^(32+n) // G)) // x^n, only
+	 * n=63 works. n < 63 would be less than the maximum degree of A_L,
+	 * which would cause insufficient precision to be carried through the
+	 * calculation. n > 63 would require a carryless multiplication
+	 * instruction that takes an input larger than 64 bits.
+	 */
+	x1 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x00);
+	x1 = _mm_clmulepi64_si128(x1, barrett_reduction_constants, 0x10);
+
+	/*
+	 * The second step, also using Barrett reduction:
 	 *
-	 * So finally we have:
+	 *	crc := x^32 * (x^32*crc + A_H) mod G
+	 *	     = floor(((x^32*crc + A_H) * floor(x^95 / G)) / x^63) * G mod x^32
 	 *
-	 *	                          / A(x) * floor(x^64 / G(x)) \
-	 *	R(x) = B(x) + G(x)*floor (  -------------------------  )
-	 *	                          \           x^32            /
+	 * Same as previous but uses the high qword of A and starts with adding
+	 * x^32*crc to it. Considering CRC32's reverse-coefficient order, that
+	 * means xor'ing crc into the index 2 dword of A, which is conveniently
+	 * the same dword index that crc is in. Select the crc dword by and-ing
+	 * with a mask, then do the xor; that is a ternary boolean operation
+	 * a ^ (b & c) which with AVX512 is a vpternlog with immediate 0x78.
 	 */
-	x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
-				  barrett_reduction_constants, 0x00);
-	x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32),
-				  barrett_reduction_constants, 0x10);
-	x0 = _mm_xor_si128(x0, x1);
-	return _mm_extract_epi32(x0, 1);
+#if USE_AVX512
+	x0 = _mm_ternarylogic_epi32(x0, x1, mask32, 0x78);
+#else
+	x0 = _mm_xor_si128(x0, _mm_and_si128(x1, mask32));
+#endif
+	x0 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x01);
+	x0 = _mm_clmulepi64_si128(x0, barrett_reduction_constants, 0x10);
+	/* The resulting crc is in the index 2 dword. */
+
+	return _mm_extract_epi32(x0, 2);
 }
 
 #undef vec_t
diff --git a/scripts/gen_crc32_multipliers.c b/scripts/gen_crc32_multipliers.c
index 1073bfc8..4ec7bcfb 100644
--- a/scripts/gen_crc32_multipliers.c
+++ b/scripts/gen_crc32_multipliers.c
@@ -74,17 +74,28 @@ compute_xD_modG(size_t D)
 	return remainder;
 }
 
-/* Compute floor(x^64 / G(x)) */
+/* Compute floor(x^95 / G(x)) */
 static u64
-compute_x64_div_G(void)
+compute_x95_div_G(void)
 {
+	/* The quotient, max order 95 - 32 = 63. */
 	u64 quotient = 0;
-	u64 dividend = 0x1;
 
-	for (int i = 0; i < 64 - 32 + 1; i++) {
-		if ((dividend >> i) & 1) {
-			quotient |= (u64)1 << i;
-			dividend ^= CRCPOLY_FULL << i;
+	/*
+	 * The x^32 through x^95 terms of the remainder. This starts at x^95
+	 * and is updated through long division. At the end only the
+	 * x^0 through x^31 terms will be nonzero, but those are unneeded.
+	 */
+	u64 remainder = 0x1;
+
+	for (int i = 0; i < 64; i++) {
+		/*
+		 * If the x^(95-i) term of remainder is nonzero, add
+		 * x^(63-i) * G(x) to cancel it out. (G(x) has order 32.)
+		 */
+		if (remainder & (1ULL << i)) {
+			quotient |= 1ULL << i;
+			remainder ^= (u64)CRCPOLY_FULL << i;
 		}
 	}
 
@@ -123,20 +134,11 @@ gen_vec_folding_constants(void)
 		printf("\n");
 	}
 
-	/* Multiplier for final 96 => 64 bit fold */
-	printf("#define CRC32_X63_MODG 0x%08"PRIx32" /* x^63 mod G(x) */\n",
-	       compute_xD_modG(63));
-
-	/*
-	 * Constants for final 64 => 32 bit reduction. These constants are the
-	 * odd ones out, as this final reduction step can't use the regular CRC
-	 * folding described above. It uses Barrett reduction instead.
-	 */
-	printf("#define CRC32_BARRETT_CONSTANT_1 0x%016"PRIx64"ULL /* floor(x^64 / G(x)) */\n",
-	       compute_x64_div_G());
+	/* Constants for the final 128 => 32 bit reduction */
+	printf("#define CRC32_BARRETT_CONSTANT_1 0x%016"PRIx64"ULL /* floor(x^95 / G(x)) */\n",
+	       compute_x95_div_G());
 	printf("#define CRC32_BARRETT_CONSTANT_2 0x%016"PRIx64"ULL /* G(x) */\n",
 	       CRCPOLY_FULL);
-	printf("#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }\n");
 }
 
 /* Multipliers for combining the CRCs of separate chunks */
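
Appendix (reviewer note, not part of the patch): the standalone C sketch below re-derives the two-step Barrett reduction that the new comments describe, but in the natural (non-reflected) bit order, so plain shifts stand in for the qword/dword selections that the pclmulqdq/pmull code does. It computes floor(x^95 / G) by long division, applies the two Barrett steps to a 128-bit value, and checks the result against a bit-at-a-time reference. The helper names (clmul64, barrett_step, ref_reduce), the use of the unsigned __int128 extension, and the test constants are illustrative assumptions, not libdeflate code.

/*
 * barrett_demo.c - sketch of the 128 => 32 bit Barrett reduction, in the
 * natural (non-reflected) bit order. Assumes GCC/Clang for unsigned __int128.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 u128;

#define G_FULL 0x104C11DB7ULL	/* G(x), degree 32, natural bit order */

/* 64 x 64 => 128 bit carryless multiplication; bit i = coefficient of x^i */
static u128
clmul64(uint64_t a, uint64_t b)
{
	u128 product = 0;

	for (int i = 0; i < 64; i++) {
		if ((b >> i) & 1)
			product ^= (u128)a << i;
	}
	return product;
}

/* floor(x^95 / G(x)) by long division; the quotient has degree 63 */
static uint64_t
x95_div_G(void)
{
	u128 remainder = (u128)1 << 95;
	uint64_t quotient = 0;

	for (int i = 63; i >= 0; i--) {
		if ((remainder >> (i + 32)) & 1) {	/* cancel the x^(32+i) term */
			quotient |= 1ULL << i;
			remainder ^= (u128)G_FULL << i;
		}
	}
	return quotient;
}

/* One Barrett step: x^32 * A mod G for deg(A) <= 63, two carryless multiplications */
static uint32_t
barrett_step(uint64_t A, uint64_t x95_over_G)
{
	/* T = floor(A * floor(x^95 / G) / x^63), a polynomial of degree <= 63 */
	uint64_t T = (uint64_t)(clmul64(A, x95_over_G) >> 63);

	/* x^32 * A mod G is the low 32 coefficients of T * G */
	return (uint32_t)clmul64(T, G_FULL);
}

/* Reference: (P * x^32) mod G for the 128-bit polynomial P = hi:lo, bit at a time */
static uint32_t
ref_reduce(uint64_t hi, uint64_t lo)
{
	uint32_t rem = 0;

	for (int i = 159; i >= 0; i--) {
		int j = i - 32;	/* P's coefficient index; j < 0 shifts in the appended zeros */
		uint32_t bit = (j < 0) ? 0 :
			(uint32_t)((j >= 64 ? hi >> (j - 64) : lo >> j) & 1);
		uint32_t msb = rem >> 31;

		rem = (rem << 1) | bit;
		if (msb)
			rem ^= (uint32_t)G_FULL;	/* subtract G whenever x^32 appears */
	}
	return rem;
}

int
main(void)
{
	const uint64_t q = x95_div_G();
	uint64_t hi = 0x0123456789abcdefULL, lo = 0xfedcba9876543210ULL;

	/* Reversing this value's 64 bits should give CRC32_BARRETT_CONSTANT_1. */
	printf("floor(x^95 / G) = 0x%016" PRIx64 " (natural bit order)\n", q);

	for (int trial = 0; trial < 10000; trial++) {
		uint32_t crc;

		/* step 1 on the high 64 coefficients, step 2 folds in the low 64 */
		crc = barrett_step(hi, q);
		crc = barrett_step(((uint64_t)crc << 32) ^ lo, q);
		if (crc != ref_reduce(hi, lo)) {
			printf("mismatch at trial %d\n", trial);
			return 1;
		}
		/* xorshift-style scramble to generate fresh test inputs */
		hi ^= hi << 13; hi ^= hi >> 7; hi ^= hi << 17;
		lo ^= lo << 13; lo ^= lo >> 7; lo ^= lo << 17;
	}
	printf("two-step Barrett reduction matches the bitwise reference\n");
	return 0;
}

Because this sketch works in natural coefficient order, its first step consumes the high 64 coefficients; in the patch's reflected representation that role is played by A_L, the physically low qword. The two clmuls per step mirror the new _mm_clmulepi64_si128 / pmull sequences, which select operands via the immediate byte and the mask32 constant instead of shifts.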