diff --git a/lib/arch/generic/32/dec_loop.c b/lib/arch/generic/32/dec_loop.c
index e2ac8248..bc6abc51 100644
--- a/lib/arch/generic/32/dec_loop.c
+++ b/lib/arch/generic/32/dec_loop.c
@@ -1,36 +1,47 @@
-// Read source 4 bytes at a time
-// Since we might be writing one byte more than needed,
-// we need to make sure there will still be some room
-// for one extra byte in o.
-// This will be the case if srclen > 0 when the loop
-// is exited
-while (srclen > 4)
+static inline void
+dec_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 {
-	const uint32_t str
-		= base64_table_dec_d0[c[0]]
-		| base64_table_dec_d1[c[1]]
-		| base64_table_dec_d2[c[2]]
-		| base64_table_dec_d3[c[3]];
+	if (*slen < 8) {
+		return;
+	}
+
+	// Process blocks of 4 bytes per round. Because one extra zero byte is
+	// written after the output, ensure that there will be at least 4 bytes
+	// of input data left to cover the gap. (Two data bytes and up to two
+	// end-of-string markers.)
+	size_t rounds = (*slen - 4) / 4;
+
+	*slen -= rounds * 4;	// 4 bytes consumed per round
+	*olen += rounds * 3;	// 3 bytes produced per round
+
+	do {
+		const uint32_t str
+			= base64_table_dec_d0[(*s)[0]]
+			| base64_table_dec_d1[(*s)[1]]
+			| base64_table_dec_d2[(*s)[2]]
+			| base64_table_dec_d3[(*s)[3]];

 #if BASE64_LITTLE_ENDIAN
-	// LUTs for little-endian set Most Significant Bit
-	// in case of invalid character
-	if (str & UINT32_C(0x80000000)) {
-		break;
-	}
+
+		// LUTs for little-endian set MSB in case of invalid character:
+		if (str & UINT32_C(0x80000000)) {
+			break;
+		}
 #else
-	// LUTs for big-endian set Least Significant Bit
-	// in case of invalid character
-	if (str & UINT32_C(1)) {
-		break;
-	}
+		// LUTs for big-endian set LSB in case of invalid character:
+		if (str & UINT32_C(1)) {
+			break;
+		}
 #endif
+		// Store the output:
+		memcpy(*o, &str, sizeof (str));
+
+		*s += 4;
+		*o += 3;

-	// Store:
-	memcpy(o, &str, sizeof (str));
+	} while (--rounds > 0);

-	c += 4;
-	o += 3;
-	outl += 3;
-	srclen -= 4;
+	// Adjust for any rounds that were skipped:
+	*slen += rounds * 4;
+	*olen -= rounds * 3;
 }
diff --git a/lib/arch/generic/codec.c b/lib/arch/generic/codec.c
index 32f011c5..e53ec068 100644
--- a/lib/arch/generic/codec.c
+++ b/lib/arch/generic/codec.c
@@ -11,6 +11,10 @@
 # include "64/enc_loop.c"
 #endif

+#if BASE64_WORDSIZE >= 32
+# include "32/dec_loop.c"
+#endif
+
 BASE64_ENC_FUNCTION(plain)
 {
 	#include "enc_head.c"
@@ -26,7 +30,7 @@ BASE64_DEC_FUNCTION(plain)
 {
 	#include "dec_head.c"
 #if BASE64_WORDSIZE >= 32
-	#include "32/dec_loop.c"
+	dec_loop_generic_32(&c, &srclen, &o, &outl);
 #endif
 	#include "dec_tail.c"
 }
diff --git a/lib/arch/neon32/codec.c b/lib/arch/neon32/codec.c
index 2dbb57f6..57181cc5 100644
--- a/lib/arch/neon32/codec.c
+++ b/lib/arch/neon32/codec.c
@@ -31,6 +31,7 @@ vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
 	return vcombine_u8(result.val[0], result.val[1]);
 }

+#include "../generic/32/dec_loop.c"
 #include "../generic/32/enc_loop.c"
 #include "dec_loop.c"
 #include "enc_reshuffle.c"
@@ -60,7 +61,7 @@ BASE64_DEC_FUNCTION(neon32)
 #ifdef BASE64_USE_NEON32
 	#include "../generic/dec_head.c"
 	dec_loop_neon32(&c, &srclen, &o, &outl);
-	#include "../generic/32/dec_loop.c"
+	dec_loop_generic_32(&c, &srclen, &o, &outl);
 	#include "../generic/dec_tail.c"
 #else
 	BASE64_DEC_STUB
diff --git a/lib/arch/neon64/codec.c b/lib/arch/neon64/codec.c
index 2be25b1d..aa0d1948 100644
--- a/lib/arch/neon64/codec.c
+++ b/lib/arch/neon64/codec.c
@@ -30,6 +30,7 @@ load_64byte_table (const uint8_t *p)
 #endif
 }

+#include "../generic/32/dec_loop.c"
 #include "../generic/64/enc_loop.c"
 #include "dec_loop.c"
 #include "enc_loop.c"
@@ -57,7 +58,7 @@ BASE64_DEC_FUNCTION(neon64)
 #ifdef BASE64_USE_NEON64
 	#include "../generic/dec_head.c"
 	dec_loop_neon64(&c, &srclen, &o, &outl);
-	#include "../generic/32/dec_loop.c"
+	dec_loop_generic_32(&c, &srclen, &o, &outl);
 	#include "../generic/dec_tail.c"
 #else
 	BASE64_DEC_STUB