diff --git a/lib/arch/ssse3/dec_loop.c b/lib/arch/ssse3/dec_loop.c
index 1423e886..9da71abe 100644
--- a/lib/arch/ssse3/dec_loop.c
+++ b/lib/arch/ssse3/dec_loop.c
@@ -65,22 +65,9 @@
 // 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 // 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 
-static inline void
-dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+static inline int
+dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
 {
-	if (*slen < 24) {
-		return;
-	}
-
-	// Process blocks of 16 bytes per round. Because 4 extra zero bytes are
-	// written after the output, ensure that there will be at least 8 bytes
-	// of input data left to cover the gap. (6 data bytes and up to two
-	// end-of-string markers.)
-	size_t rounds = (*slen - 8) / 16;
-
-	*slen -= rounds * 16; // 16 bytes consumed per round
-	*olen += rounds * 12; // 12 bytes produced per round
-
 	const __m128i lut_lo = _mm_setr_epi8(
 		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
 		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
@@ -95,39 +82,90 @@ dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 	const __m128i mask_2F = _mm_set1_epi8(0x2F);
 
-	do {
-		// Load input:
-		__m128i str = _mm_loadu_si128((__m128i *) *s);
-
-		// Table lookups:
-		const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F);
-		const __m128i lo_nibbles = _mm_and_si128(str, mask_2F);
-		const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles);
-		const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles);
-
-		// Check for invalid input: if any "and" values from lo and hi
-		// are not zero, fall back on bytewise code to do error
-		// checking and reporting:
-		if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) {
-			break;
-		}
+	// Load input:
+	__m128i str = _mm_loadu_si128((__m128i *) *s);
 
-		const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F);
-		const __m128i roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles));
+	// Table lookups:
+	const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F);
+	const __m128i lo_nibbles = _mm_and_si128(str, mask_2F);
+	const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles);
+	const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles);
 
-		// Now simply add the delta values to the input:
-		str = _mm_add_epi8(str, roll);
+	// Check for invalid input: if any "and" values from lo and hi are not
+	// zero, fall back on bytewise code to do error checking and reporting:
+	if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) {
+		return 0;
+	}
+
+	const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F);
+	const __m128i roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles));
+
+	// Now simply add the delta values to the input:
+	str = _mm_add_epi8(str, roll);
 
-		// Reshuffle the input to packed 12-byte output format:
-		str = dec_reshuffle(str);
+	// Reshuffle the input to packed 12-byte output format:
+	str = dec_reshuffle(str);
 
-		// Store the output:
-		_mm_storeu_si128((__m128i *) *o, str);
+	// Store the output:
+	_mm_storeu_si128((__m128i *) *o, str);
+
+	*s += 16;
+	*o += 12;
+	*rounds -= 1;
+
+	return 1;
+}
 
-		*s += 16;
-		*o += 12;
+static inline void
+dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	if (*slen < 24) {
+		return;
+	}
+
+	// Process blocks of 16 bytes per round. Because 4 extra zero bytes are
+	// written after the output, ensure that there will be at least 8 bytes
+	// of input data left to cover the gap. (6 data bytes and up to two
+	// end-of-string markers.)
+	size_t rounds = (*slen - 8) / 16;
+
+	*slen -= rounds * 16; // 16 bytes consumed per round
+	*olen += rounds * 12; // 12 bytes produced per round
+
+	do {
+		if (rounds >= 8) {
+			if (dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		if (rounds >= 4) {
+			if (dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		if (rounds >= 2) {
+			if (dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		dec_loop_ssse3_inner(s, o, &rounds);
+		break;
 
-	} while (--rounds > 0);
+	} while (rounds > 0);
 
 	// Adjust for any rounds that were skipped:
 	*slen += rounds * 16;