NEON32: dec: factor decoding loop into inline function
aklomp committed Nov 27, 2019
1 parent 3d3e7ea commit 06b9de3
Showing 2 changed files with 83 additions and 83 deletions.
3 changes: 2 additions & 1 deletion lib/arch/neon32/codec.c
@@ -32,6 +32,7 @@ vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
 }
 
 #include "../generic/32/enc_loop.c"
+#include "dec_loop.c"
 #include "enc_reshuffle.c"
 #include "enc_translate.c"
 #include "enc_loop.c"
@@ -58,7 +59,7 @@ BASE64_DEC_FUNCTION(neon32)
 {
 #ifdef BASE64_USE_NEON32
 	#include "../generic/dec_head.c"
-	#include "dec_loop.c"
+	dec_loop_neon32(&c, &srclen, &o, &outl);
 	#include "../generic/32/dec_loop.c"
 	#include "../generic/dec_tail.c"
 #else
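The call replaces a textual #include of the loop body: dec_loop_neon32() now receives pointers to the source/destination cursors and lengths, advances them in place for every full 64-byte block it decodes, and returns so the generic 32-bit loop and tail code can pick up the remainder. A minimal scalar sketch of that calling convention (the name dec_loop_sketch and the dummy 4-in/3-out copy are illustrative, not part of this commit):

#include <stddef.h>
#include <stdint.h>

// Sketch of the cursor-advancing convention: consume whole blocks, update
// the caller's pointers and lengths in place, and leave any partial block
// for the bytewise tail. The "decode" is a dummy 4-in/3-out byte copy,
// standing in for the real NEON lane decode.
static inline void
dec_loop_sketch (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	while (*slen >= 4) {
		(*o)[0] = (*s)[0];	// placeholder for the real 6-bit repacking
		(*o)[1] = (*s)[1];
		(*o)[2] = (*s)[2];
		*s += 4;
		*o += 3;
		*slen -= 4;
		*olen += 3;
	}
}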
163 changes: 81 additions & 82 deletions lib/arch/neon32/dec_loop.c
@@ -1,107 +1,106 @@
-// If we have NEON support, pick off 64 bytes at a time for as long as we can.
-// Unlike the SSE codecs, we don't write trailing zero bytes to output, so we
-// don't need to check if we have enough remaining input to cover them:
-while (srclen >= 64)
-{
-	uint8x16x3_t dec;
-
-	// Load 64 bytes and deinterleave:
-	uint8x16x4_t str = vld4q_u8((uint8_t *)c);
-
-	// See ssse3/dec_loop.c for an explanation of how the code works.
-	const uint8x16_t lut_lo = {
-		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
-		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
-	};
-
-	const uint8x16_t lut_hi = {
-		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
-		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
-	};
-
-	const uint8x16_t lut_roll = {
-		0, 16, 19, 4, (uint8_t)-65, (uint8_t)-65, (uint8_t)-71, (uint8_t)-71,
-		0, 0, 0, 0, 0, 0, 0, 0
-	};
-
-	const uint8x16_t mask_0F = vdupq_n_u8(0x0F);
-	const uint8x16_t mask_2F = vdupq_n_u8(0x2F);
-
-	uint8x16_t classified;
-
-	{
-		const uint8x16_t hi_nibbles = vshrq_n_u8(str.val[0], 4);
-		const uint8x16_t lo_nibbles = vandq_u8(str.val[0], mask_0F);
-		const uint8x16_t eq_2F = vceqq_u8(str.val[0], mask_2F);
-
-		const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
-		const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);
-
-		const uint8x16_t delta = vqtbl1q_u8(lut_roll, vaddq_u8(eq_2F, hi_nibbles));
-		classified = vandq_u8(lo, hi);
-		// Now simply add the delta values to the input:
-		str.val[0] = vaddq_u8(str.val[0], delta);
-	}
-	{
-		const uint8x16_t hi_nibbles = vshrq_n_u8(str.val[1], 4);
-		const uint8x16_t lo_nibbles = vandq_u8(str.val[1], mask_0F);
-		const uint8x16_t eq_2F = vceqq_u8(str.val[1], mask_2F);
-
-		const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
-		const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);
-
-		const uint8x16_t delta = vqtbl1q_u8(lut_roll, vaddq_u8(eq_2F, hi_nibbles));
-		classified = vorrq_u8(classified, vandq_u8(lo, hi));
-		// Now simply add the delta values to the input:
-		str.val[1] = vaddq_u8(str.val[1], delta);
-	}
-	{
-		const uint8x16_t hi_nibbles = vshrq_n_u8(str.val[2], 4);
-		const uint8x16_t lo_nibbles = vandq_u8(str.val[2], mask_0F);
-		const uint8x16_t eq_2F = vceqq_u8(str.val[2], mask_2F);
-
-		const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
-		const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);
-
-		const uint8x16_t delta = vqtbl1q_u8(lut_roll, vaddq_u8(eq_2F, hi_nibbles));
-		classified = vorrq_u8(classified, vandq_u8(lo, hi));
-		// Now simply add the delta values to the input:
-		str.val[2] = vaddq_u8(str.val[2], delta);
-	}
-	{
-		const uint8x16_t hi_nibbles = vshrq_n_u8(str.val[3], 4);
-		const uint8x16_t lo_nibbles = vandq_u8(str.val[3], mask_0F);
-		const uint8x16_t eq_2F = vceqq_u8(str.val[3], mask_2F);
-
-		const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
-		const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);
-
-		const uint8x16_t delta = vqtbl1q_u8(lut_roll, vaddq_u8(eq_2F, hi_nibbles));
-		classified = vorrq_u8(classified, vandq_u8(lo, hi));
-		// Now simply add the delta values to the input:
-		str.val[3] = vaddq_u8(str.val[3], delta);
-	}
-
-	// Check for invalid input: if any of the delta values are zero,
-	// fall back on bytewise code to do error checking and reporting:
-	// Extract both 32-bit halves; check that all bits are zero:
-	if (vgetq_lane_u32((uint32x4_t)classified, 0) != 0
-	 || vgetq_lane_u32((uint32x4_t)classified, 1) != 0
-	 || vgetq_lane_u32((uint32x4_t)classified, 2) != 0
-	 || vgetq_lane_u32((uint32x4_t)classified, 3) != 0) {
-		break;
-	}
-
-	// Compress four bytes into three:
-	dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
-	dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
-	dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
-
-	// Interleave and store decoded result:
-	vst3q_u8((uint8_t *)o, dec);
-
-	c += 64;
-	o += 48;
-	outl += 48;
-	srclen -= 64;
-}
+static inline int
+is_nonzero (const uint8x16_t v)
+{
+	uint64_t u64;
+	const uint64x2_t v64 = vreinterpretq_u64_u8(v);
+	const uint32x2_t v32 = vqmovn_u64(v64);
+
+	vst1_u64(&u64, vreinterpret_u64_u32(v32));
+	return u64 != 0;
+}
+
+static inline uint8x16_t
+delta_lookup (const uint8x16_t v)
+{
+	const uint8x8_t lut = {
+		0, 16, 19, 4, (uint8_t) -65, (uint8_t) -65, (uint8_t) -71, (uint8_t) -71,
+	};
+
+	return vcombine_u8(
+		vtbl1_u8(lut, vget_low_u8(v)),
+		vtbl1_u8(lut, vget_high_u8(v)));
+}
+
+static inline uint8x16_t
+dec_loop_neon32_lane (uint8x16_t *lane)
+{
+	// See the SSSE3 decoder for an explanation of the algorithm.
+	const uint8x16_t lut_lo = {
+		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
+	};
+
+	const uint8x16_t lut_hi = {
+		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
+	};
+
+	const uint8x16_t mask_0F = vdupq_n_u8(0x0F);
+	const uint8x16_t mask_2F = vdupq_n_u8(0x2F);
+
+	const uint8x16_t hi_nibbles = vshrq_n_u8(*lane, 4);
+	const uint8x16_t lo_nibbles = vandq_u8(*lane, mask_0F);
+	const uint8x16_t eq_2F = vceqq_u8(*lane, mask_2F);
+
+	const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
+	const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);
+
+	// Now simply add the delta values to the input:
+	*lane = vaddq_u8(*lane, delta_lookup(vaddq_u8(eq_2F, hi_nibbles)));
+
+	// Return the validity mask:
+	return vandq_u8(lo, hi);
+}
+
+static inline void
+dec_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	if (*slen < 64) {
+		return;
+	}
+
+	// Process blocks of 64 bytes per round. Unlike the SSE codecs, no
+	// extra trailing zero bytes are written, so it is not necessary to
+	// reserve extra input bytes:
+	size_t rounds = *slen / 64;
+
+	*slen -= rounds * 64;	// 64 bytes consumed per round
+	*olen += rounds * 48;	// 48 bytes produced per round
+
+	do {
+		uint8x16x3_t dec;
+
+		// Load 64 bytes and deinterleave:
+		uint8x16x4_t str = vld4q_u8(*s);
+
+		// Decode each lane, collect a mask of invalid inputs:
+		const uint8x16_t classified
+			= dec_loop_neon32_lane(&str.val[0])
+			| dec_loop_neon32_lane(&str.val[1])
+			| dec_loop_neon32_lane(&str.val[2])
+			| dec_loop_neon32_lane(&str.val[3]);
+
+		// Check for invalid input: if any of the delta values are
+		// zero, fall back on bytewise code to do error checking and
+		// reporting:
+		if (is_nonzero(classified)) {
+			break;
+		}
+
+		// Compress four bytes into three:
+		dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
+		dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
+		dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
+
+		// Interleave and store decoded result:
+		vst3q_u8(*o, dec);
+
+		*s += 64;
+		*o += 48;
+	} while (--rounds > 0);
+
+	// Adjust for any rounds that were skipped:
+	*slen += rounds * 64;
+	*olen -= rounds * 48;
+}
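A note on is_nonzero() above: rather than extracting four 32-bit lanes as the old code did, it saturating-narrows each 64-bit half with vqmovn_u64, so any set bit in the 128-bit vector survives into a single 64-bit scalar that can be tested at once. A scalar model of the same check (illustrative only, not from the commit):

#include <stdint.h>

// Scalar model of is_nonzero(): vqmovn_u64 narrows each 64-bit lane to
// 32 bits with saturation, so a nonzero lane never narrows to zero; the
// two narrowed halves are then tested as one 64-bit value.
static inline int
is_nonzero_model (uint64_t lane0, uint64_t lane1)
{
	uint32_t n0 = lane0 > UINT32_MAX ? UINT32_MAX : (uint32_t) lane0;
	uint32_t n1 = lane1 > UINT32_MAX ? UINT32_MAX : (uint32_t) lane1;
	return (((uint64_t) n1 << 32) | n0) != 0;
}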

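The delta_lookup() table holds the usual base64 ASCII offsets, indexed by high nibble, with the eq_2F comparison nudging '/' (0x2F) from index 2 to index 1, since 0xFF acts as -1 in the wrap-around byte addition. A scalar model (the name decode_delta_model is hypothetical, not part of the commit):

#include <stdint.h>

// Scalar model of the delta table: '+' (0x2B) + 19 = 62, '/' (0x2F) + 16
// = 63, '0'..'9' + 4 = 52..61, 'A'..'Z' - 65 = 0..25, 'a'..'z' - 71 =
// 26..51. Indices 8..15 fall outside the 8-byte table, where vtbl1_u8
// returns zero.
static uint8_t
decode_delta_model (uint8_t c)
{
	static const uint8_t lut[8] = {
		0, 16, 19, 4,
		(uint8_t) -65, (uint8_t) -65, (uint8_t) -71, (uint8_t) -71,
	};
	uint8_t index = (uint8_t) ((c == 0x2F ? 0xFF : 0x00) + (c >> 4));
	uint8_t delta = index < 8 ? lut[index] : 0;	// vtbl1_u8 out of range -> 0
	return (uint8_t) (c + delta);
}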
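Finally, the length bookkeeping in dec_loop_neon32() charges all rounds upfront and then refunds whatever the early break on invalid input leaves unprocessed. A scalar sketch of that pattern (hypothetical helper, for illustration; fail_at simulates invalid input in a given round):

#include <stddef.h>

// Sketch of the charge-then-refund pattern: lengths are adjusted for all
// rounds before the loop and corrected afterwards for rounds skipped by
// an early break.
static void
bookkeeping_sketch (size_t *slen, size_t *olen, size_t fail_at)
{
	size_t rounds = *slen / 64;

	if (rounds == 0) {
		return;
	}
	*slen -= rounds * 64;	// charge: 64 bytes consumed per round
	*olen += rounds * 48;	// charge: 48 bytes produced per round

	size_t i = 0;
	do {
		if (i++ == fail_at) {
			break;	// stands in for the invalid-input break
		}
		// ... one 64-byte round would be decoded here ...
	} while (--rounds > 0);

	// Refund the rounds that were not processed:
	*slen += rounds * 64;
	*olen -= rounds * 48;
}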