diff --git a/lib/arch/ssse3/enc_loop.c b/lib/arch/ssse3/enc_loop.c index 87a50784..6de652e1 100644 --- a/lib/arch/ssse3/enc_loop.c +++ b/lib/arch/ssse3/enc_loop.c @@ -1,3 +1,22 @@ +static inline void +enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o) +{ + // Load input: + __m128i str = _mm_loadu_si128((__m128i *) *s); + + // Reshuffle: + str = enc_reshuffle(str); + + // Translate reshuffled bytes to the Base64 alphabet: + str = enc_translate(str); + + // Store: + _mm_storeu_si128((__m128i *) *o, str); + + *s += 12; + *o += 16; +} + static inline void enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) { @@ -15,20 +34,34 @@ enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) *olen += rounds * 16; // 16 bytes produced per round do { - // Load string: - __m128i str = _mm_loadu_si128((__m128i *) *s); - - // Reshuffle: - str = enc_reshuffle(str); - - // Translate reshuffled bytes to the Base64 alphabet: - str = enc_translate(str); - - // Store: - _mm_storeu_si128((__m128i *) *o, str); - - *s += 12; - *o += 16; + if (rounds >= 8) { + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + rounds -= 8; + continue; + } + if (rounds >= 4) { + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + rounds -= 4; + continue; + } + if (rounds >= 2) { + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + rounds -= 2; + continue; + } + enc_loop_ssse3_inner(s, o); + break; - } while (--rounds > 0); + } while (rounds > 0); }