#61: encoders: unroll inner loops
The transformation of encoder kernels to inline functions (#58) allows
us to move the inner encoding loop into separate inline functions.

Because the number of remaining loop iterations is known, we can split
calls to the inner loop into long unrolled stretches. Tests show that
this can result in significant speedups.
aklomp committed Nov 28, 2019
2 parents 0024d2b + 8680b68 commit dd8dc9b
Showing 8 changed files with 342 additions and 135 deletions.
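The commit applies the same dispatch pattern to every encoder's outer loop. Below is a minimal sketch of that pattern, distilled from the diffs that follow; the `inner` and `encode_rounds` names are placeholders for illustration, not library code:

```c
#include <stddef.h>
#include <stdint.h>

// Placeholder for a per-architecture inner step such as enc_loop_avx2_inner;
// the real versions consume a fixed number of input bytes per call and
// advance both pointers accordingly.
static inline void
inner (const uint8_t **s, uint8_t **o)
{
	*(*o)++ = *(*s)++;	// dummy body
}

// Dispatch on the known number of remaining rounds, unrolling the inner
// call in stretches of 8, 4 and 2 before falling back to a single call:
static void
encode_rounds (size_t rounds, const uint8_t **s, uint8_t **o)
{
	while (rounds > 0) {
		if (rounds >= 8) {
			inner(s, o); inner(s, o); inner(s, o); inner(s, o);
			inner(s, o); inner(s, o); inner(s, o); inner(s, o);
			rounds -= 8;
			continue;
		}
		if (rounds >= 4) {
			inner(s, o); inner(s, o); inner(s, o); inner(s, o);
			rounds -= 4;
			continue;
		}
		if (rounds >= 2) {
			inner(s, o); inner(s, o);
			rounds -= 2;
			continue;
		}
		inner(s, o);
		break;
	}
}
```

The unrolled stretches give the compiler long straight-line runs to schedule, while the 8/4/2/1 dispatch keeps the number of loop-count checks per block small.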
10 changes: 5 additions & 5 deletions README.md
@@ -434,11 +434,11 @@ x86 processors
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4884\* | 7099\* | 4917\* | 7057\* | 4799\* | 7143\* | 4902\* | 7219\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 5212\* | 8849\* | 5284\* | 9099\* | 5289\* | 9220\* | 4849\* | 9200\* |
| i7-4870HQ @ 2.5 GHz | 1471\* | 3066\* | 6721\* | 6962\* | 7015\* | 8267\* | 8328\* | 11576\* |
-| i5-4590S @ 3.0 GHz | 1858 | 3090 | 4329 | 5825 | 4305 | 6092 | 4241 | 6174 |
-| Xeon X5570 @ 2.93 GHz | 1137 | 1473 | 2536 | 3283 | - | - | - | - |
+| i5-4590S @ 3.0 GHz | 1843 | 3090 | 4363 | 5825 | 4243 | 6092 | 4160 | 6174 |
+| Xeon X5570 @ 2.93 GHz | 1151 | 1473 | 3160 | 3283 | - | - | - | - |
| Pentium4 @ 3.4 GHz | 528\* | 448\* | - | - | - | - | - | - |
-| Atom N270 | 126 | 233 | 484 | 407 | - | - | - | - |
-| AMD E-450 | 378 | 564 | 614 | 660 | - | - | - | - |
+| Atom N270 | 108 | 233 | 508 | 407 | - | - | - | - |
+| AMD E-450 | 395 | 564 | 625 | 660 | - | - | - | - |
| Intel Edison @ 500 MHz | 79\* | 92\* | 152\* | 172\* | - | - | - | - |
| Intel Edison @ 500 MHz OPENMP 2 thread | 158\* | 184\* | 300\* | 343\* | - | - | - | - |
| Intel Edison @ 500 MHz (x86-64) | 97\* | 146\* | 197\* | 207\* | - | - | - | - |
@@ -449,7 +449,7 @@ ARM processors
| Processor | Plain enc | Plain dec | NEON32 enc | NEON32 dec | NEON64 enc | NEON64 dec |
|-------------------------------------------|----------:|----------:|-----------:|-----------:|-----------:|-----------:|
| Raspberry PI B+ V1.2 | 46\* | 40\* | - | - | - | - |
-| Raspberry PI 2 B V1.1 | 86 | 84 | 188 | 225 | - | - |
+| Raspberry PI 2 B V1.1 | 87 | 84 | 192 | 225 | - | - |
| Apple iPhone SE armv7 | 1056\* | 895\* | 2943\* | 2618\* | - | - |
| Apple iPhone SE arm64 | 1061\* | 1239\* | - | - | 4098\* | 3983\* |

88 changes: 66 additions & 22 deletions lib/arch/avx2/enc_loop.c
@@ -1,3 +1,37 @@
+static inline void
+enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
+{
+	// First load is done at s - 0 to not get a segfault:
+	__m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+	// Shift by 4 bytes, as required by enc_reshuffle:
+	src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
+
+	// Reshuffle, translate, store:
+	src = enc_reshuffle(src);
+	src = enc_translate(src);
+	_mm256_storeu_si256((__m256i *) *o, src);
+
+	// Subsequent loads will be done at s - 4, set pointer for next round:
+	*s += 20;
+	*o += 32;
+}
+
+static inline void
+enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
+{
+	// Load input:
+	__m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+	// Reshuffle, translate, store:
+	src = enc_reshuffle(src);
+	src = enc_translate(src);
+	_mm256_storeu_si256((__m256i *) *o, src);
+
+	*s += 24;
+	*o += 32;
+}
+
static inline void
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
@@ -14,30 +48,40 @@ enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
	*slen -= rounds * 24; // 24 bytes consumed per round
	*olen += rounds * 32; // 32 bytes produced per round

-	// First load is done at s - 0 to not get a segfault:
-	__m256i inputvector = _mm256_loadu_si256((__m256i *) *s);
-
-	// Subsequent loads will be done at s - 4, set pointer for next round:
-	*s += 20;
-
-	// Shift by 4 bytes, as required by enc_reshuffle:
-	inputvector = _mm256_permutevar8x32_epi32(inputvector, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
-
-	for (;;) {
-
-		// Reshuffle, translate, store:
-		inputvector = enc_reshuffle(inputvector);
-		inputvector = enc_translate(inputvector);
-		_mm256_storeu_si256((__m256i *) *o, inputvector);
-		*o += 32;
+	// The first loop iteration requires special handling to ensure that
+	// the read, which is done at an offset, does not underflow the buffer:
+	enc_loop_avx2_inner_first(s, o);
+	rounds--;

-		if (--rounds == 0) {
-			break;
+	while (rounds > 0) {
+		if (rounds >= 8) {
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 8;
+			continue;
		}
-
-		// Load for the next round:
-		inputvector = _mm256_loadu_si256((__m256i *) *s);
-		*s += 24;
+		if (rounds >= 4) {
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_avx2_inner(s, o);
+		break;
	}

	// Add the offset back:
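The special first iteration exists because `enc_reshuffle` wants its 24 payload bytes at an offset of 4 within the 32-byte register; after the first round, `*s` is advanced by only 20, so every later round can simply load at `*s`, which then points 4 bytes before the next unprocessed input. The following standalone check is hypothetical and not part of the library (compile with `-mavx2`); it only illustrates why the lane permutation can stand in for an offset load on the first round:

```c
// Hypothetical standalone check, not library code: loading at `p` and
// permuting lanes (0, 0, 1, 2, 3, 4, 5, 6) yields the same register
// contents as loading at `p - 4`, except in the first 4 bytes, which
// enc_reshuffle ignores.
#include <immintrin.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>

int main(void)
{
	uint8_t buf[64];
	for (int i = 0; i < 64; i++)
		buf[i] = (uint8_t) i;

	const uint8_t *p = buf + 4;	// stand-in for the input pointer *s

	__m256i a = _mm256_loadu_si256((const __m256i *) p);
	a = _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));

	__m256i b = _mm256_loadu_si256((const __m256i *) (p - 4));

	uint8_t av[32], bv[32];
	_mm256_storeu_si256((__m256i *) av, a);
	_mm256_storeu_si256((__m256i *) bv, b);

	// Bytes 4..31 must match; bytes 0..3 may differ:
	assert(memcmp(av + 4, bv + 4, 28) == 0);
	return 0;
}
```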
73 changes: 53 additions & 20 deletions lib/arch/generic/32/enc_loop.c
@@ -1,3 +1,27 @@
+static inline void
+enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
+{
+	uint32_t src;
+
+	// Load input:
+	memcpy(&src, *s, sizeof (src));
+
+	// Reorder to 32-bit big-endian, if not already in that format. The
+	// workset must be in big-endian, otherwise the shifted bits do not
+	// carry over properly among adjacent bytes:
+	src = BASE64_HTOBE32(src);
+
+	// Shift input by 6 bytes each round and mask in only the lower 6 bits;
+	// look up the character in the Base64 encoding table and write it to
+	// the output location:
+	*(*o)++ = base64_table_enc[(src >> 26) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 20) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 14) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 8) & 0x3F];
+
+	*s += 3;
+}
+
static inline void
enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
@@ -15,25 +39,34 @@ enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
	*olen += rounds * 4; // 4 bytes produced per round

	do {
-		uint32_t src;
-
-		// Load input:
-		memcpy(&src, *s, sizeof (src));
-
-		// Reorder to 32-bit big-endian, if not already in that format.
-		// The workset must be in big-endian, otherwise the shifted
-		// bits do not carry over properly among adjacent bytes:
-		src = BASE64_HTOBE32(src);
-
-		// Shift input by 6 bytes each round and mask in only the lower
-		// 6 bits; look up the character in the Base64 encoding table
-		// and write it to the output location:
-		*(*o)++ = base64_table_enc[(src >> 26) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 20) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 14) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 8) & 0x3F];
-
-		*s += 3;
+		if (rounds >= 8) {
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_generic_32_inner(s, o);
+		break;

-	} while (--rounds > 0);
+	} while (rounds > 0);
}
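The generic 32-bit inner step packs three input bytes into the top of a big-endian word and peels off four 6-bit indices. The following standalone sketch is hypothetical: the `encode3` helper and the inlined alphabet string stand in for the library's `base64_table_enc`, but the shift-and-mask arithmetic is the same, reproducing the classic "Man" → "TWFu" example:

```c
#include <stdint.h>
#include <stdio.h>

// Standard Base64 alphabet, standing in for base64_table_enc:
static const char tbl[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

// Encode exactly three bytes into four Base64 characters using the same
// big-endian shift-and-mask scheme as enc_loop_generic_32_inner:
static void
encode3 (const uint8_t *in, char out[4])
{
	// Equivalent to loading 4 bytes and converting to big-endian,
	// with the fourth (ignored) byte left as zero:
	uint32_t src = ((uint32_t) in[0] << 24)
	             | ((uint32_t) in[1] << 16)
	             | ((uint32_t) in[2] <<  8);

	out[0] = tbl[(src >> 26) & 0x3F];	// bits 31..26
	out[1] = tbl[(src >> 20) & 0x3F];	// bits 25..20
	out[2] = tbl[(src >> 14) & 0x3F];	// bits 19..14
	out[3] = tbl[(src >>  8) & 0x3F];	// bits 13..8
}

int main(void)
{
	char out[5] = {0};
	encode3((const uint8_t *) "Man", out);
	printf("%s\n", out);	// prints "TWFu"
	return 0;
}
```

The 64-bit variant in the next file works the same way, except that it loads eight bytes at a time and emits eight output characters per round.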
83 changes: 58 additions & 25 deletions lib/arch/generic/64/enc_loop.c
@@ -1,3 +1,31 @@
+static inline void
+enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
+{
+	uint64_t src;
+
+	// Load input:
+	memcpy(&src, *s, sizeof (src));
+
+	// Reorder to 64-bit big-endian, if not already in that format. The
+	// workset must be in big-endian, otherwise the shifted bits do not
+	// carry over properly among adjacent bytes:
+	src = BASE64_HTOBE64(src);
+
+	// Shift input by 6 bytes each round and mask in only the lower 6 bits;
+	// look up the character in the Base64 encoding table and write it to
+	// the output location:
+	*(*o)++ = base64_table_enc[(src >> 58) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 52) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 46) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 40) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 34) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 28) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 22) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 16) & 0x3F];
+
+	*s += 6;
+}
+
static inline void
enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
@@ -15,29 +43,34 @@ enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
	*olen += rounds * 8; // 8 bytes produced per round

	do {
-		uint64_t src;
-
-		// Load input:
-		memcpy(&src, *s, sizeof (src));
-
-		// Reorder to 64-bit big-endian, if not already in that format.
-		// The workset must be in big-endian, otherwise the shifted
-		// bits do not carry over properly among adjacent bytes:
-		src = BASE64_HTOBE64(src);
-
-		// Shift input by 6 bytes each round and mask in only the lower
-		// 6 bits; look up the character in the Base64 encoding table
-		// and write it to the output location:
-		*(*o)++ = base64_table_enc[(src >> 58) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 52) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 46) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 40) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 34) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 28) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 22) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 16) & 0x3F];
-
-		*s += 6;
-
-	} while (--rounds > 0);
+		if (rounds >= 8) {
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_generic_64_inner(s, o);
+		break;
+
+	} while (rounds > 0);
}
62 changes: 47 additions & 15 deletions lib/arch/neon32/enc_loop.c
@@ -1,26 +1,58 @@
static inline void
-enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
{
-	size_t rounds = *slen / 48;
+	// Load 48 bytes and deinterleave:
+	uint8x16x3_t src = vld3q_u8(*s);

-	*slen -= rounds * 48; // 48 bytes consumed per round
-	*olen += rounds * 64; // 64 bytes produced per round
+	// Reshuffle:
+	uint8x16x4_t out = enc_reshuffle(src);

-	while (rounds-- > 0) {
+	// Translate reshuffled bytes to the Base64 alphabet:
+	out = enc_translate(out);

-		// Load 48 bytes and deinterleave:
-		uint8x16x3_t src = vld3q_u8(*s);
+	// Interleave and store output:
+	vst4q_u8(*o, out);

-		// Reshuffle:
-		uint8x16x4_t out = enc_reshuffle(src);
+	*s += 48;
+	*o += 64;
+}

-		// Translate reshuffled bytes to the Base64 alphabet:
-		out = enc_translate(out);
+static inline void
+enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	size_t rounds = *slen / 48;

-		// Interleave and store output:
-		vst4q_u8(*o, out);
+	*slen -= rounds * 48; // 48 bytes consumed per round
+	*olen += rounds * 64; // 64 bytes produced per round

-		*s += 48;
-		*o += 64;
+	while (rounds > 0) {
+		if (rounds >= 8) {
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_neon32_inner(s, o);
+		break;
	}
}
2 changes: 1 addition & 1 deletion lib/arch/neon32/enc_translate.c
@@ -1,5 +1,5 @@
static inline uint8x16x4_t
-enc_translate (uint8x16x4_t in)
+enc_translate (const uint8x16x4_t in)
{
	// A lookup table containing the absolute offsets for all ranges:
	const uint8x16_t lut = {