diff --git a/lib/arch/generic/32/dec_loop.c b/lib/arch/generic/32/dec_loop.c
index e2ac8248..bc6abc51 100644
--- a/lib/arch/generic/32/dec_loop.c
+++ b/lib/arch/generic/32/dec_loop.c
@@ -1,36 +1,47 @@
-// Read source 4 bytes at a time
-// Since we might be writing one byte more than needed,
-// we need to make sure there will still be some room
-// for one extra byte in o.
-// This will be the case if srclen > 0 when the loop
-// is exited
-while (srclen > 4)
+static inline void
+dec_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 {
-	const uint32_t str
-		= base64_table_dec_d0[c[0]]
-		| base64_table_dec_d1[c[1]]
-		| base64_table_dec_d2[c[2]]
-		| base64_table_dec_d3[c[3]];
+	if (*slen < 8) {
+		return;
+	}
+
+	// Process blocks of 4 bytes per round. Because one extra zero byte is
+	// written after the output, ensure that there will be at least 4 bytes
+	// of input data left to cover the gap. (Two data bytes and up to two
+	// end-of-string markers.)
+	size_t rounds = (*slen - 4) / 4;
+
+	*slen -= rounds * 4;	// 4 bytes consumed per round
+	*olen += rounds * 3;	// 3 bytes produced per round
+
+	do {
+		const uint32_t str
+			= base64_table_dec_d0[(*s)[0]]
+			| base64_table_dec_d1[(*s)[1]]
+			| base64_table_dec_d2[(*s)[2]]
+			| base64_table_dec_d3[(*s)[3]];

 #if BASE64_LITTLE_ENDIAN
-	// LUTs for little-endian set Most Significant Bit
-	// in case of invalid character
-	if (str & UINT32_C(0x80000000)) {
-		break;
-	}
+
+		// LUTs for little-endian set MSB in case of invalid character:
+		if (str & UINT32_C(0x80000000)) {
+			break;
+		}
 #else
-	// LUTs for big-endian set Least Significant Bit
-	// in case of invalid character
-	if (str & UINT32_C(1)) {
-		break;
-	}
+		// LUTs for big-endian set LSB in case of invalid character:
+		if (str & UINT32_C(1)) {
+			break;
+		}
 #endif
+		// Store the output:
+		memcpy(*o, &str, sizeof (str));
+
+		*s += 4;
+		*o += 3;

-	// Store:
-	memcpy(o, &str, sizeof (str));
+	} while (--rounds > 0);

-	c += 4;
-	o += 3;
-	outl += 3;
-	srclen -= 4;
+	// Adjust for any rounds that were skipped:
+	*slen += rounds * 4;
+	*olen -= rounds * 3;
 }
diff --git a/lib/arch/generic/codec.c b/lib/arch/generic/codec.c
index 32f011c5..e53ec068 100644
--- a/lib/arch/generic/codec.c
+++ b/lib/arch/generic/codec.c
@@ -11,6 +11,10 @@
 # include "64/enc_loop.c"
 #endif

+#if BASE64_WORDSIZE >= 32
+# include "32/dec_loop.c"
+#endif
+
 BASE64_ENC_FUNCTION(plain)
 {
 	#include "enc_head.c"
@@ -26,7 +30,7 @@ BASE64_DEC_FUNCTION(plain)
 {
 	#include "dec_head.c"
 #if BASE64_WORDSIZE >= 32
-	#include "32/dec_loop.c"
+	dec_loop_generic_32(&c, &srclen, &o, &outl);
 #endif
 	#include "dec_tail.c"
 }
diff --git a/lib/arch/neon32/codec.c b/lib/arch/neon32/codec.c
index 2dbb57f6..57181cc5 100644
--- a/lib/arch/neon32/codec.c
+++ b/lib/arch/neon32/codec.c
@@ -31,6 +31,7 @@ vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
 	return vcombine_u8(result.val[0], result.val[1]);
 }

+#include "../generic/32/dec_loop.c"
 #include "../generic/32/enc_loop.c"
 #include "dec_loop.c"
 #include "enc_reshuffle.c"
@@ -60,7 +61,7 @@ BASE64_DEC_FUNCTION(neon32)
 #ifdef BASE64_USE_NEON32
 	#include "../generic/dec_head.c"
 	dec_loop_neon32(&c, &srclen, &o, &outl);
-	#include "../generic/32/dec_loop.c"
+	dec_loop_generic_32(&c, &srclen, &o, &outl);
 	#include "../generic/dec_tail.c"
 #else
 	BASE64_DEC_STUB
diff --git a/lib/arch/neon64/codec.c b/lib/arch/neon64/codec.c
index 2be25b1d..aa0d1948 100644
--- a/lib/arch/neon64/codec.c
+++ b/lib/arch/neon64/codec.c
@@ -30,6 +30,7 @@ load_64byte_table (const uint8_t *p)
 #endif
 }

+#include "../generic/32/dec_loop.c"
 #include "../generic/64/enc_loop.c"
 #include "dec_loop.c"
 #include "enc_loop.c"
@@ -57,7 +58,7 @@ BASE64_DEC_FUNCTION(neon64)
 #ifdef BASE64_USE_NEON64
 	#include "../generic/dec_head.c"
 	dec_loop_neon64(&c, &srclen, &o, &outl);
-	#include "../generic/32/dec_loop.c"
+	dec_loop_generic_32(&c, &srclen, &o, &outl);
 	#include "../generic/dec_tail.c"
 #else
 	BASE64_DEC_STUB