diff --git a/README.md b/README.md
index 8bb0fc6d..9fec5c74 100644
--- a/README.md
+++ b/README.md
@@ -434,11 +434,11 @@ x86 processors
 | i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4884\* | 7099\* | 4917\* | 7057\* | 4799\* | 7143\* | 4902\* | 7219\* |
 | i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 5212\* | 8849\* | 5284\* | 9099\* | 5289\* | 9220\* | 4849\* | 9200\* |
 | i7-4870HQ @ 2.5 GHz | 1471\* | 3066\* | 6721\* | 6962\* | 7015\* | 8267\* | 8328\* | 11576\* |
-| i5-4590S @ 3.0 GHz | 1858 | 3090 | 4329 | 5825 | 4305 | 6092 | 4241 | 6174 |
-| Xeon X5570 @ 2.93 GHz | 1137 | 1473 | 2536 | 3283 | - | - | - | - |
+| i5-4590S @ 3.0 GHz | 1843 | 3090 | 4363 | 5825 | 4243 | 6092 | 4160 | 6174 |
+| Xeon X5570 @ 2.93 GHz | 1151 | 1473 | 3160 | 3283 | - | - | - | - |
 | Pentium4 @ 3.4 GHz | 528\* | 448\* | - | - | - | - | - | - |
-| Atom N270 | 126 | 233 | 484 | 407 | - | - | - | - |
-| AMD E-450 | 378 | 564 | 614 | 660 | - | - | - | - |
+| Atom N270 | 108 | 233 | 508 | 407 | - | - | - | - |
+| AMD E-450 | 395 | 564 | 625 | 660 | - | - | - | - |
 | Intel Edison @ 500 MHz | 79\* | 92\* | 152\* | 172\* | - | - | - | - |
 | Intel Edison @ 500 MHz OPENMP 2 thread | 158\* | 184\* | 300\* | 343\* | - | - | - | - |
 | Intel Edison @ 500 MHz (x86-64) | 97\* | 146\* | 197\* | 207\* | - | - | - | - |
@@ -449,7 +449,7 @@ ARM processors
 | Processor | Plain enc | Plain dec | NEON32 enc | NEON32 dec | NEON64 enc | NEON64 dec |
 |-------------------------------------------|----------:|----------:|-----------:|-----------:|-----------:|-----------:|
 | Raspberry PI B+ V1.2 | 46\* | 40\* | - | - | - | - |
-| Raspberry PI 2 B V1.1 | 86 | 84 | 188 | 225 | - | - |
+| Raspberry PI 2 B V1.1 | 87 | 84 | 192 | 225 | - | - |
 | Apple iPhone SE armv7 | 1056\* | 895\* | 2943\* | 2618\* | - | - |
 | Apple iPhone SE arm64 | 1061\* | 1239\* | - | - | 4098\* | 3983\* |
 
diff --git a/lib/arch/avx2/enc_loop.c b/lib/arch/avx2/enc_loop.c
index a4992240..b9e2736f 100644
--- a/lib/arch/avx2/enc_loop.c
+++ b/lib/arch/avx2/enc_loop.c
@@ -1,3 +1,37 @@
+static inline void
+enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
+{
+	// First load is done at s - 0 to not get a segfault:
+	__m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+	// Shift by 4 bytes, as required by enc_reshuffle:
+	src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
+
+	// Reshuffle, translate, store:
+	src = enc_reshuffle(src);
+	src = enc_translate(src);
+	_mm256_storeu_si256((__m256i *) *o, src);
+
+	// Subsequent loads will be done at s - 4, set pointer for next round:
+	*s += 20;
+	*o += 32;
+}
+
+static inline void
+enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
+{
+	// Load input:
+	__m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+	// Reshuffle, translate, store:
+	src = enc_reshuffle(src);
+	src = enc_translate(src);
+	_mm256_storeu_si256((__m256i *) *o, src);
+
+	*s += 24;
+	*o += 32;
+}
+
 static inline void
 enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 {
@@ -14,30 +48,40 @@ enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 	*slen -= rounds * 24;	// 24 bytes consumed per round
 	*olen += rounds * 32;	// 32 bytes produced per round
 
-	// First load is done at s - 0 to not get a segfault:
-	__m256i inputvector = _mm256_loadu_si256((__m256i *) *s);
-
-	// Subsequent loads will be done at s - 4, set pointer for next round:
-	*s += 20;
-
-	// Shift by 4 bytes, as required by enc_reshuffle:
-	inputvector = _mm256_permutevar8x32_epi32(inputvector, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
-
-	for (;;) {
-
-		// Reshuffle, translate, store:
-		inputvector = enc_reshuffle(inputvector);
-		inputvector = enc_translate(inputvector);
-		_mm256_storeu_si256((__m256i *) *o, inputvector);
-		*o += 32;
+	// The first loop iteration requires special handling to ensure that
+	// the read, which is done at an offset, does not underflow the buffer:
+	enc_loop_avx2_inner_first(s, o);
+	rounds--;
 
-		if (--rounds == 0) {
-			break;
+	while (rounds > 0) {
+		if (rounds >= 8) {
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 8;
+			continue;
 		}
-
-		// Load for the next round:
-		inputvector = _mm256_loadu_si256((__m256i *) *s);
-		*s += 24;
+		if (rounds >= 4) {
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_avx2_inner(s, o);
+		break;
 	}
 
 	// Add the offset back:
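The 8/4/2/1 dispatch above reappears in every loop touched by this patch: unrolling by eight amortizes the loop-control overhead, and the descending if-ladder finishes any remainder without a jump table; because each arm ends in a continue, a count like 13 is handled as 8 + 4 + 1. Below is a minimal, self-contained sketch of just that dispatch, with a hypothetical do_round() standing in for the per-round inner functions (an illustration of the pattern, not code from the patch):

#include <stddef.h>
#include <stdio.h>

// Hypothetical stand-in for the enc_loop_*_inner() functions:
static void do_round (size_t *done)
{
	(*done)++;
}

static void run_rounds (size_t rounds)
{
	size_t done = 0;

	while (rounds > 0) {
		if (rounds >= 8) {
			do_round(&done); do_round(&done);
			do_round(&done); do_round(&done);
			do_round(&done); do_round(&done);
			do_round(&done); do_round(&done);
			rounds -= 8;
			continue;
		}
		if (rounds >= 4) {
			do_round(&done); do_round(&done);
			do_round(&done); do_round(&done);
			rounds -= 4;
			continue;
		}
		if (rounds >= 2) {
			do_round(&done); do_round(&done);
			rounds -= 2;
			continue;
		}
		do_round(&done);
		break;
	}

	printf("%zu rounds executed\n", done);
}

int main (void)
{
	run_rounds(13);	// 13 = 8 + 4 + 1: prints "13 rounds executed"
	return 0;
}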
diff --git a/lib/arch/generic/32/enc_loop.c b/lib/arch/generic/32/enc_loop.c
index d03de5a2..3c333a1f 100644
--- a/lib/arch/generic/32/enc_loop.c
+++ b/lib/arch/generic/32/enc_loop.c
@@ -1,3 +1,27 @@
+static inline void
+enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
+{
+	uint32_t src;
+
+	// Load input:
+	memcpy(&src, *s, sizeof (src));
+
+	// Reorder to 32-bit big-endian, if not already in that format. The
+	// workset must be in big-endian, otherwise the shifted bits do not
+	// carry over properly among adjacent bytes:
+	src = BASE64_HTOBE32(src);
+
+	// Shift input by 6 bits each round and mask in only the lower 6 bits;
+	// look up the character in the Base64 encoding table and write it to
+	// the output location:
+	*(*o)++ = base64_table_enc[(src >> 26) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 20) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 14) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 8) & 0x3F];
+
+	*s += 3;
+}
+
 static inline void
 enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 {
@@ -15,25 +39,34 @@ enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 	*olen += rounds * 4;	// 4 bytes produced per round
 
 	do {
-		uint32_t src;
-
-		// Load input:
-		memcpy(&src, *s, sizeof (src));
-
-		// Reorder to 32-bit big-endian, if not already in that format.
-		// The workset must be in big-endian, otherwise the shifted
-		// bits do not carry over properly among adjacent bytes:
-		src = BASE64_HTOBE32(src);
-
-		// Shift input by 6 bytes each round and mask in only the lower
-		// 6 bits; look up the character in the Base64 encoding table
-		// and write it to the output location:
-		*(*o)++ = base64_table_enc[(src >> 26) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 20) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 14) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 8) & 0x3F];
-
-		*s += 3;
+		if (rounds >= 8) {
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_generic_32_inner(s, o);
+		break;
 
-	} while (--rounds > 0);
+	} while (rounds > 0);
 }
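The four shift counts in enc_loop_generic_32_inner (26, 20, 14, 8) simply walk the top 24 bits of the big-endian word in 6-bit steps. A standalone way to convince yourself, using the classic "Man" -> "TWFu" example; the local alphabet string stands in for base64_table_enc, and the hand-built word stands in for memcpy plus BASE64_HTOBE32 on a little-endian machine:

#include <stdint.h>
#include <stdio.h>

// Standard Base64 alphabet, standing in for base64_table_enc:
static const char tbl[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

int main (void)
{
	// Pack "Man" into the top 24 bits, as memcpy + BASE64_HTOBE32
	// would produce on a little-endian machine:
	const uint32_t src = ((uint32_t) 'M' << 24)
	                   | ((uint32_t) 'a' << 16)
	                   | ((uint32_t) 'n' << 8);

	// The same four shifts as enc_loop_generic_32_inner:
	putchar(tbl[(src >> 26) & 0x3F]);	// 'T'
	putchar(tbl[(src >> 20) & 0x3F]);	// 'W'
	putchar(tbl[(src >> 14) & 0x3F]);	// 'F'
	putchar(tbl[(src >> 8) & 0x3F]);	// 'u'
	putchar('\n');

	return 0;
}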
diff --git a/lib/arch/generic/64/enc_loop.c b/lib/arch/generic/64/enc_loop.c
index 05236f64..19a0267d 100644
--- a/lib/arch/generic/64/enc_loop.c
+++ b/lib/arch/generic/64/enc_loop.c
@@ -1,3 +1,31 @@
+static inline void
+enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
+{
+	uint64_t src;
+
+	// Load input:
+	memcpy(&src, *s, sizeof (src));
+
+	// Reorder to 64-bit big-endian, if not already in that format. The
+	// workset must be in big-endian, otherwise the shifted bits do not
+	// carry over properly among adjacent bytes:
+	src = BASE64_HTOBE64(src);
+
+	// Shift input by 6 bits each round and mask in only the lower 6 bits;
+	// look up the character in the Base64 encoding table and write it to
+	// the output location:
+	*(*o)++ = base64_table_enc[(src >> 58) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 52) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 46) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 40) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 34) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 28) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 22) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 16) & 0x3F];
+
+	*s += 6;
+}
+
 static inline void
 enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 {
@@ -15,29 +43,34 @@ enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 	*olen += rounds * 8;	// 8 bytes produced per round
 
 	do {
-		uint64_t src;
-
-		// Load input:
-		memcpy(&src, *s, sizeof (src));
-
-		// Reorder to 64-bit big-endian, if not already in that format.
-		// The workset must be in big-endian, otherwise the shifted
-		// bits do not carry over properly among adjacent bytes:
-		src = BASE64_HTOBE64(src);
-
-		// Shift input by 6 bytes each round and mask in only the lower
-		// 6 bits; look up the character in the Base64 encoding table
-		// and write it to the output location:
-		*(*o)++ = base64_table_enc[(src >> 58) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 52) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 46) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 40) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 34) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 28) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 22) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 16) & 0x3F];
-
-		*s += 6;
-
-	} while (--rounds > 0);
+		if (rounds >= 8) {
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_generic_64_inner(s, o);
+		break;
+
+	} while (rounds > 0);
 }
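The eight shift counts in the 64-bit variant (58 down to 16, in steps of 6) tile exactly the top 48 bits of the big-endian word, which is why each round consumes six input bytes even though memcpy loads eight; the round count must therefore be chosen so that the final 8-byte load stays inside the buffer. A standalone self-check that the decomposition covers those 48 bits and nothing else:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main (void)
{
	// "ManMan" packed big-endian into the top 48 bits, low 16 bits unused:
	const uint64_t src = 0x4D616E4D616E0000ULL;

	uint64_t rebuilt = 0;

	// Extract the same eight sextets as enc_loop_generic_64_inner,
	// then put each one back at its original position:
	for (int shift = 16; shift <= 58; shift += 6) {
		rebuilt |= ((src >> shift) & 0x3F) << shift;
	}

	// The sextets exactly cover bits 63..16, i.e. six input bytes:
	assert(rebuilt == (src & 0xFFFFFFFFFFFF0000ULL));
	puts("ok");
	return 0;
}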
diff --git a/lib/arch/neon32/enc_loop.c b/lib/arch/neon32/enc_loop.c
index f48d7044..dde66574 100644
--- a/lib/arch/neon32/enc_loop.c
+++ b/lib/arch/neon32/enc_loop.c
@@ -1,26 +1,58 @@
 static inline void
-enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
 {
-	size_t rounds = *slen / 48;
+	// Load 48 bytes and deinterleave:
+	uint8x16x3_t src = vld3q_u8(*s);
 
-	*slen -= rounds * 48;	// 48 bytes consumed per round
-	*olen += rounds * 64;	// 64 bytes produced per round
+	// Reshuffle:
+	uint8x16x4_t out = enc_reshuffle(src);
 
-	while (rounds-- > 0) {
+	// Translate reshuffled bytes to the Base64 alphabet:
+	out = enc_translate(out);
 
-		// Load 48 bytes and deinterleave:
-		uint8x16x3_t src = vld3q_u8(*s);
+	// Interleave and store output:
+	vst4q_u8(*o, out);
 
-		// Reshuffle:
-		uint8x16x4_t out = enc_reshuffle(src);
+	*s += 48;
+	*o += 64;
+}
 
-		// Translate reshuffled bytes to the Base64 alphabet:
-		out = enc_translate(out);
+static inline void
+enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	size_t rounds = *slen / 48;
 
-		// Interleave and store output:
-		vst4q_u8(*o, out);
+	*slen -= rounds * 48;	// 48 bytes consumed per round
+	*olen += rounds * 64;	// 64 bytes produced per round
 
-		*s += 48;
-		*o += 64;
+	while (rounds > 0) {
+		if (rounds >= 8) {
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_neon32_inner(s, o);
+		break;
 	}
 }
diff --git a/lib/arch/neon32/enc_translate.c b/lib/arch/neon32/enc_translate.c
index ff460824..e616d54b 100644
--- a/lib/arch/neon32/enc_translate.c
+++ b/lib/arch/neon32/enc_translate.c
@@ -1,5 +1,5 @@
 static inline uint8x16x4_t
-enc_translate (uint8x16x4_t in)
+enc_translate (const uint8x16x4_t in)
 {
 	// A lookup table containing the absolute offsets for all ranges:
 	const uint8x16_t lut = {
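On the NEON side, vld3q_u8 and vst4q_u8 do the structural work: a 3-way deinterleaving load that splits 48 bytes into three 16-lane registers, and a 4-way interleaving store that merges four registers into 64 output bytes. A scalar model of those two operations, as a plain-C sketch rather than the NEON path itself:

#include <stdint.h>
#include <stdio.h>

// 3-way deinterleaving load, like vld3q_u8: lane i of val[k] gets
// input byte 3 * i + k.
static void deinterleave3 (const uint8_t *src, uint8_t val[3][16])
{
	for (int i = 0; i < 16; i++) {
		val[0][i] = src[3 * i + 0];
		val[1][i] = src[3 * i + 1];
		val[2][i] = src[3 * i + 2];
	}
}

// 4-way interleaving store, like vst4q_u8: output byte 4 * i + k
// comes from lane i of val[k].
static void interleave4 (uint8_t *dst, const uint8_t val[4][16])
{
	for (int i = 0; i < 16; i++) {
		dst[4 * i + 0] = val[0][i];
		dst[4 * i + 1] = val[1][i];
		dst[4 * i + 2] = val[2][i];
		dst[4 * i + 3] = val[3][i];
	}
}

int main (void)
{
	uint8_t in[48], out[64], val[4][16];

	for (int i = 0; i < 48; i++) {
		in[i] = (uint8_t) i;
	}

	deinterleave3(in, val);	// val[0] = 0, 3, 6, ...; val[1] = 1, 4, 7, ...

	for (int i = 0; i < 16; i++) {
		val[3][i] = 0;	// dummy fourth lane
	}

	interleave4(out, (const uint8_t (*)[16]) val);

	printf("%d %d %d %d\n", out[4], out[5], out[6], out[7]);	// 3 4 5 0
	return 0;
}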
diff --git a/lib/arch/neon64/enc_loop.c b/lib/arch/neon64/enc_loop.c
index 74fcf6e0..b9dcd322 100644
--- a/lib/arch/neon64/enc_loop.c
+++ b/lib/arch/neon64/enc_loop.c
@@ -1,3 +1,38 @@
+static inline void
+enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t *tbl_enc)
+{
+	uint8x16x4_t out;
+
+	// Load 48 bytes and deinterleave:
+	const uint8x16x3_t src = vld3q_u8(*s);
+
+	// Divide bits of three input bytes over four output bytes:
+	out.val[0] = vshrq_n_u8(src.val[0], 2);
+	out.val[1] = vshrq_n_u8(src.val[1], 4) | vshlq_n_u8(src.val[0], 4);
+	out.val[2] = vshrq_n_u8(src.val[2], 6) | vshlq_n_u8(src.val[1], 2);
+	out.val[3] = src.val[2];
+
+	// Clear top two bits:
+	out.val[0] &= vdupq_n_u8(0x3F);
+	out.val[1] &= vdupq_n_u8(0x3F);
+	out.val[2] &= vdupq_n_u8(0x3F);
+	out.val[3] &= vdupq_n_u8(0x3F);
+
+	// The bits have now been shifted to the right locations;
+	// translate their values 0..63 to the Base64 alphabet.
+	// Use a 64-byte table lookup:
+	out.val[0] = vqtbl4q_u8(*tbl_enc, out.val[0]);
+	out.val[1] = vqtbl4q_u8(*tbl_enc, out.val[1]);
+	out.val[2] = vqtbl4q_u8(*tbl_enc, out.val[2]);
+	out.val[3] = vqtbl4q_u8(*tbl_enc, out.val[3]);
+
+	// Interleave and store output:
+	vst4q_u8(*o, out);
+
+	*s += 48;
+	*o += 64;
+}
+
 static inline void
 enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 {
@@ -9,37 +44,34 @@ enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 	// Load the encoding table:
 	const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc);
 
-	while (rounds-- > 0) {
-		uint8x16x3_t src;
-		uint8x16x4_t out;
-
-		// Load 48 bytes and deinterleave:
-		src = vld3q_u8(*s);
-
-		// Divide bits of three input bytes over four output bytes:
-		out.val[0] = vshrq_n_u8(src.val[0], 2);
-		out.val[1] = vshrq_n_u8(src.val[1], 4) | vshlq_n_u8(src.val[0], 4);
-		out.val[2] = vshrq_n_u8(src.val[2], 6) | vshlq_n_u8(src.val[1], 2);
-		out.val[3] = src.val[2];
-
-		// Clear top two bits:
-		out.val[0] &= vdupq_n_u8(0x3F);
-		out.val[1] &= vdupq_n_u8(0x3F);
-		out.val[2] &= vdupq_n_u8(0x3F);
-		out.val[3] &= vdupq_n_u8(0x3F);
-
-		// The bits have now been shifted to the right locations;
-		// translate their values 0..63 to the Base64 alphabet.
-		// Use a 64-byte table lookup:
-		out.val[0] = vqtbl4q_u8(tbl_enc, out.val[0]);
-		out.val[1] = vqtbl4q_u8(tbl_enc, out.val[1]);
-		out.val[2] = vqtbl4q_u8(tbl_enc, out.val[2]);
-		out.val[3] = vqtbl4q_u8(tbl_enc, out.val[3]);
-
-		// Interleave and store output:
-		vst4q_u8(*o, out);
-
-		*s += 48;
-		*o += 64;
+	while (rounds > 0) {
+		if (rounds >= 8) {
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_neon64_inner(s, o, &tbl_enc);
+		break;
 	}
 }
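The bit division in enc_loop_neon64_inner is the whole-register version of the familiar scalar arithmetic: three bytes a, b, c become the four sextets a>>2, (a<<4 | b>>4), (b<<2 | c>>6) and c, each masked to six bits; vqtbl4q_u8 then maps every value through the 64-byte table, which is passed by pointer so it is loaded only once per call of the outer function. A standalone scalar check of those shifts:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main (void)
{
	const uint8_t a = 'M', b = 'a', c = 'n';

	// Same shift/or/mask scheme as the NEON code, on single bytes:
	const uint8_t i0 = (a >> 2) & 0x3F;
	const uint8_t i1 = ((b >> 4) | (a << 4)) & 0x3F;
	const uint8_t i2 = ((c >> 6) | (b << 2)) & 0x3F;
	const uint8_t i3 = c & 0x3F;

	// "Man" encodes to "TWFu", i.e. alphabet indices 19, 22, 5, 46:
	assert(i0 == 19 && i1 == 22 && i2 == 5 && i3 == 46);
	puts("ok");
	return 0;
}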
diff --git a/lib/arch/ssse3/enc_loop.c b/lib/arch/ssse3/enc_loop.c
index 87a50784..6de652e1 100644
--- a/lib/arch/ssse3/enc_loop.c
+++ b/lib/arch/ssse3/enc_loop.c
@@ -1,3 +1,22 @@
+static inline void
+enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o)
+{
+	// Load input:
+	__m128i str = _mm_loadu_si128((__m128i *) *s);
+
+	// Reshuffle:
+	str = enc_reshuffle(str);
+
+	// Translate reshuffled bytes to the Base64 alphabet:
+	str = enc_translate(str);
+
+	// Store:
+	_mm_storeu_si128((__m128i *) *o, str);
+
+	*s += 12;
+	*o += 16;
+}
+
 static inline void
 enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 {
@@ -15,20 +34,34 @@ enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 	*olen += rounds * 16;	// 16 bytes produced per round
 
 	do {
-		// Load string:
-		__m128i str = _mm_loadu_si128((__m128i *) *s);
-
-		// Reshuffle:
-		str = enc_reshuffle(str);
-
-		// Translate reshuffled bytes to the Base64 alphabet:
-		str = enc_translate(str);
-
-		// Store:
-		_mm_storeu_si128((__m128i *) *o, str);
-
-		*s += 12;
-		*o += 16;
+		if (rounds >= 8) {
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_ssse3_inner(s, o);
+		break;
 
-	} while (--rounds > 0);
+	} while (rounds > 0);
 }
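All of these loops share the same calling convention: they advance *s and *o and debit *slen / credit *olen for however many whole rounds they ran, and whatever the vector loop leaves behind (its round count is computed so the oversized load never overruns the input) falls through to a plain scalar encoder. A hedged sketch of that composition with a hypothetical enc_tail(); the real library's codec plumbing and padding handling are not shown:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// Standard Base64 alphabet, standing in for base64_table_enc:
static const char tbl[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

// Hypothetical scalar tail: encodes whole 3-byte groups, updating the
// cursors exactly like the vectorized loops do (padding not shown):
static void enc_tail (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	while (*slen >= 3) {
		const uint8_t *p = *s;

		*(*o)++ = tbl[p[0] >> 2];
		*(*o)++ = tbl[((p[0] << 4) | (p[1] >> 4)) & 0x3F];
		*(*o)++ = tbl[((p[1] << 2) | (p[2] >> 6)) & 0x3F];
		*(*o)++ = tbl[p[2] & 0x3F];

		*s    += 3;
		*slen -= 3;
		*olen += 4;
	}
}

int main (void)
{
	const uint8_t  in[] = "ManMan";
	const uint8_t *s    = in;
	size_t         slen = 6;
	uint8_t        out[9];
	uint8_t       *o    = out;
	size_t         olen = 0;

	// A real codec would first run a vectorized loop here, e.g.
	// enc_loop_ssse3(&s, &slen, &o, &olen), and let the tail finish:
	enc_tail(&s, &slen, &o, &olen);

	out[olen] = '\0';
	printf("%s\n", (const char *) out);	// TWFuTWFu
	return 0;
}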