diff --git a/README.md b/README.md
index 8bb0fc6d..9fec5c74 100644
--- a/README.md
+++ b/README.md
@@ -434,11 +434,11 @@ x86 processors
 | i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4884\* | 7099\* | 4917\* | 7057\* | 4799\* | 7143\* | 4902\* | 7219\* |
 | i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 5212\* | 8849\* | 5284\* | 9099\* | 5289\* | 9220\* | 4849\* | 9200\* |
 | i7-4870HQ @ 2.5 GHz | 1471\* | 3066\* | 6721\* | 6962\* | 7015\* | 8267\* | 8328\* | 11576\* |
-| i5-4590S @ 3.0 GHz | 1858 | 3090 | 4329 | 5825 | 4305 | 6092 | 4241 | 6174 |
-| Xeon X5570 @ 2.93 GHz | 1137 | 1473 | 2536 | 3283 | - | - | - | - |
+| i5-4590S @ 3.0 GHz | 1843 | 3090 | 4363 | 5825 | 4243 | 6092 | 4160 | 6174 |
+| Xeon X5570 @ 2.93 GHz | 1151 | 1473 | 3160 | 3283 | - | - | - | - |
 | Pentium4 @ 3.4 GHz | 528\* | 448\* | - | - | - | - | - | - |
-| Atom N270 | 126 | 233 | 484 | 407 | - | - | - | - |
-| AMD E-450 | 378 | 564 | 614 | 660 | - | - | - | - |
+| Atom N270 | 108 | 233 | 508 | 407 | - | - | - | - |
+| AMD E-450 | 395 | 564 | 625 | 660 | - | - | - | - |
 | Intel Edison @ 500 MHz | 79\* | 92\* | 152\* | 172\* | - | - | - | - |
 | Intel Edison @ 500 MHz OPENMP 2 thread | 158\* | 184\* | 300\* | 343\* | - | - | - | - |
 | Intel Edison @ 500 MHz (x86-64) | 97\* | 146\* | 197\* | 207\* | - | - | - | - |
@@ -449,7 +449,7 @@ ARM processors
 | Processor | Plain enc | Plain dec | NEON32 enc | NEON32 dec | NEON64 enc | NEON64 dec |
 |-------------------------------------------|----------:|----------:|-----------:|-----------:|-----------:|-----------:|
 | Raspberry PI B+ V1.2 | 46\* | 40\* | - | - | - | - |
-| Raspberry PI 2 B V1.1 | 86 | 84 | 188 | 225 | - | - |
+| Raspberry PI 2 B V1.1 | 87 | 84 | 192 | 225 | - | - |
 | Apple iPhone SE armv7 | 1056\* | 895\* | 2943\* | 2618\* | - | - |
 | Apple iPhone SE arm64 | 1061\* | 1239\* | - | - | 4098\* | 3983\* |
 
diff --git a/lib/arch/avx2/enc_loop.c b/lib/arch/avx2/enc_loop.c
index a4992240..b9e2736f 100644
--- a/lib/arch/avx2/enc_loop.c
+++ b/lib/arch/avx2/enc_loop.c
@@ -1,3 +1,37 @@
+static inline void
+enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
+{
+	// First load is done at s - 0 to not get a segfault:
+	__m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+	// Shift by 4 bytes, as required by enc_reshuffle:
+	src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
+
+	// Reshuffle, translate, store:
+	src = enc_reshuffle(src);
+	src = enc_translate(src);
+	_mm256_storeu_si256((__m256i *) *o, src);
+
+	// Subsequent loads will be done at s - 4, set pointer for next round:
+	*s += 20;
+	*o += 32;
+}
+
+static inline void
+enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
+{
+	// Load input:
+	__m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+	// Reshuffle, translate, store:
+	src = enc_reshuffle(src);
+	src = enc_translate(src);
+	_mm256_storeu_si256((__m256i *) *o, src);
+
+	*s += 24;
+	*o += 32;
+}
+
 static inline void
 enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 {
@@ -14,30 +48,40 @@ enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 	*slen -= rounds * 24;	// 24 bytes consumed per round
 	*olen += rounds * 32;	// 32 bytes produced per round
 
-	// First load is done at s - 0 to not get a segfault:
-	__m256i inputvector = _mm256_loadu_si256((__m256i *) *s);
-
-	// Subsequent loads will be done at s - 4, set pointer for next round:
-	*s += 20;
-
-	// Shift by 4 bytes, as required by enc_reshuffle:
-	inputvector = _mm256_permutevar8x32_epi32(inputvector, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
-
-	for (;;) {
-
-		// Reshuffle, translate, store:
-		inputvector = enc_reshuffle(inputvector);
-		inputvector = enc_translate(inputvector);
-		_mm256_storeu_si256((__m256i *) *o, inputvector);
-		*o += 32;
+	// The first loop iteration requires special handling to ensure that
+	// the read, which is done at an offset, does not underflow the buffer:
+	enc_loop_avx2_inner_first(s, o);
+	rounds--;
 
-		if (--rounds == 0) {
-			break;
+	while (rounds > 0) {
+		if (rounds >= 8) {
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 8;
+			continue;
 		}
-
-		// Load for the next round:
-		inputvector = _mm256_loadu_si256((__m256i *) *s);
-		*s += 24;
+		if (rounds >= 4) {
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_avx2_inner(s, o);
+		break;
 	}
 
 	// Add the offset back:
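The 8/4/2/1 dispatch above reappears in every loop touched by this patch: unrolling by eight amortizes the loop-control overhead, and the descending if-ladder finishes any remainder without a jump table; because each arm ends in a continue, a count like 13 is handled as 8 + 4 + 1. Below is a minimal, self-contained sketch of just that dispatch, with a hypothetical do_round() standing in for the per-round inner functions (an illustration of the pattern, not code from the patch):

#include <stddef.h>
#include <stdio.h>

// Hypothetical stand-in for the enc_loop_*_inner() functions:
static void do_round (size_t *done)
{
	(*done)++;
}

static void run_rounds (size_t rounds)
{
	size_t done = 0;

	while (rounds > 0) {
		if (rounds >= 8) {
			do_round(&done); do_round(&done);
			do_round(&done); do_round(&done);
			do_round(&done); do_round(&done);
			do_round(&done); do_round(&done);
			rounds -= 8;
			continue;
		}
		if (rounds >= 4) {
			do_round(&done); do_round(&done);
			do_round(&done); do_round(&done);
			rounds -= 4;
			continue;
		}
		if (rounds >= 2) {
			do_round(&done); do_round(&done);
			rounds -= 2;
			continue;
		}
		do_round(&done);
		break;
	}

	printf("%zu rounds executed\n", done);
}

int main (void)
{
	run_rounds(13);	// 13 = 8 + 4 + 1: prints "13 rounds executed"
	return 0;
}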
diff --git a/lib/arch/generic/32/enc_loop.c b/lib/arch/generic/32/enc_loop.c
index d03de5a2..3c333a1f 100644
--- a/lib/arch/generic/32/enc_loop.c
+++ b/lib/arch/generic/32/enc_loop.c
@@ -1,3 +1,27 @@
+static inline void
+enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
+{
+	uint32_t src;
+
+	// Load input:
+	memcpy(&src, *s, sizeof (src));
+
+	// Reorder to 32-bit big-endian, if not already in that format. The
+	// workset must be in big-endian, otherwise the shifted bits do not
+	// carry over properly among adjacent bytes:
+	src = BASE64_HTOBE32(src);
+
+	// Shift input by 6 bits each round and mask in only the lower 6 bits;
+	// look up the character in the Base64 encoding table and write it to
+	// the output location:
+	*(*o)++ = base64_table_enc[(src >> 26) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 20) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 14) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 8) & 0x3F];
+
+	*s += 3;
+}
+
 static inline void
 enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 {
@@ -15,25 +39,34 @@ enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 	*olen += rounds * 4;	// 4 bytes produced per round
 
 	do {
-		uint32_t src;
-
-		// Load input:
-		memcpy(&src, *s, sizeof (src));
-
-		// Reorder to 32-bit big-endian, if not already in that format.
-		// The workset must be in big-endian, otherwise the shifted
-		// bits do not carry over properly among adjacent bytes:
-		src = BASE64_HTOBE32(src);
-
-		// Shift input by 6 bytes each round and mask in only the lower
-		// 6 bits; look up the character in the Base64 encoding table
-		// and write it to the output location:
-		*(*o)++ = base64_table_enc[(src >> 26) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 20) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 14) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 8) & 0x3F];
-
-		*s += 3;
+		if (rounds >= 8) {
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_generic_32_inner(s, o);
+		break;
 
-	} while (--rounds > 0);
+	} while (rounds > 0);
 }
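The four shift counts in enc_loop_generic_32_inner (26, 20, 14, 8) simply walk the top 24 bits of the big-endian word in 6-bit steps. A standalone way to convince yourself, using the classic "Man" -> "TWFu" example; the local alphabet string stands in for base64_table_enc, and the hand-built word stands in for memcpy plus BASE64_HTOBE32 on a little-endian machine:

#include <stdint.h>
#include <stdio.h>

// Standard Base64 alphabet, standing in for base64_table_enc:
static const char tbl[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

int main (void)
{
	// Pack "Man" into the top 24 bits, as memcpy + BASE64_HTOBE32
	// would produce on a little-endian machine:
	const uint32_t src = ((uint32_t) 'M' << 24)
	                   | ((uint32_t) 'a' << 16)
	                   | ((uint32_t) 'n' << 8);

	// The same four shifts as enc_loop_generic_32_inner:
	putchar(tbl[(src >> 26) & 0x3F]);	// 'T'
	putchar(tbl[(src >> 20) & 0x3F]);	// 'W'
	putchar(tbl[(src >> 14) & 0x3F]);	// 'F'
	putchar(tbl[(src >> 8) & 0x3F]);	// 'u'
	putchar('\n');

	return 0;
}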
diff --git a/lib/arch/generic/64/enc_loop.c b/lib/arch/generic/64/enc_loop.c
index 05236f64..19a0267d 100644
--- a/lib/arch/generic/64/enc_loop.c
+++ b/lib/arch/generic/64/enc_loop.c
@@ -1,3 +1,31 @@
+static inline void
+enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
+{
+	uint64_t src;
+
+	// Load input:
+	memcpy(&src, *s, sizeof (src));
+
+	// Reorder to 64-bit big-endian, if not already in that format. The
+	// workset must be in big-endian, otherwise the shifted bits do not
+	// carry over properly among adjacent bytes:
+	src = BASE64_HTOBE64(src);
+
+	// Shift input by 6 bits each round and mask in only the lower 6 bits;
+	// look up the character in the Base64 encoding table and write it to
+	// the output location:
+	*(*o)++ = base64_table_enc[(src >> 58) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 52) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 46) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 40) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 34) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 28) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 22) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 16) & 0x3F];
+
+	*s += 6;
+}
+
 static inline void
 enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 {
@@ -15,29 +43,34 @@ enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 	*olen += rounds * 8;	// 8 bytes produced per round
 
 	do {
-		uint64_t src;
-
-		// Load input:
-		memcpy(&src, *s, sizeof (src));
-
-		// Reorder to 64-bit big-endian, if not already in that format.
-		// The workset must be in big-endian, otherwise the shifted
-		// bits do not carry over properly among adjacent bytes:
-		src = BASE64_HTOBE64(src);
-
-		// Shift input by 6 bytes each round and mask in only the lower
-		// 6 bits; look up the character in the Base64 encoding table
-		// and write it to the output location:
-		*(*o)++ = base64_table_enc[(src >> 58) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 52) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 46) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 40) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 34) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 28) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 22) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 16) & 0x3F];
-
-		*s += 6;
-
-	} while (--rounds > 0);
+		if (rounds >= 8) {
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_generic_64_inner(s, o);
+		break;
+
+	} while (rounds > 0);
 }
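The eight shift counts in the 64-bit variant (58 down to 16, in steps of 6) tile exactly the top 48 bits of the big-endian word, which is why each round consumes six input bytes even though memcpy loads eight; the round count must therefore be chosen so that the final 8-byte load stays inside the buffer. A standalone self-check that the decomposition covers those 48 bits and nothing else:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main (void)
{
	// "ManMan" packed big-endian into the top 48 bits, low 16 bits unused:
	const uint64_t src = 0x4D616E4D616E0000ULL;

	uint64_t rebuilt = 0;

	// Extract the same eight sextets as enc_loop_generic_64_inner,
	// then put each one back at its original position:
	for (int shift = 16; shift <= 58; shift += 6) {
		rebuilt |= ((src >> shift) & 0x3F) << shift;
	}

	// The sextets exactly cover bits 63..16, i.e. six input bytes:
	assert(rebuilt == (src & 0xFFFFFFFFFFFF0000ULL));
	puts("ok");
	return 0;
}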
diff --git a/lib/arch/neon32/enc_loop.c b/lib/arch/neon32/enc_loop.c
index f48d7044..dde66574 100644
--- a/lib/arch/neon32/enc_loop.c
+++ b/lib/arch/neon32/enc_loop.c
@@ -1,26 +1,58 @@
 static inline void
-enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
 {
-	size_t rounds = *slen / 48;
+	// Load 48 bytes and deinterleave:
+	uint8x16x3_t src = vld3q_u8(*s);
 
-	*slen -= rounds * 48;	// 48 bytes consumed per round
-	*olen += rounds * 64;	// 64 bytes produced per round
+	// Reshuffle:
+	uint8x16x4_t out = enc_reshuffle(src);
 
-	while (rounds-- > 0) {
+	// Translate reshuffled bytes to the Base64 alphabet:
+	out = enc_translate(out);
 
-		// Load 48 bytes and deinterleave:
-		uint8x16x3_t src = vld3q_u8(*s);
+	// Interleave and store output:
+	vst4q_u8(*o, out);
 
-		// Reshuffle:
-		uint8x16x4_t out = enc_reshuffle(src);
+	*s += 48;
+	*o += 64;
+}
 
-		// Translate reshuffled bytes to the Base64 alphabet:
-		out = enc_translate(out);
+static inline void
+enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	size_t rounds = *slen / 48;
 
-		// Interleave and store output:
-		vst4q_u8(*o, out);
+	*slen -= rounds * 48;	// 48 bytes consumed per round
+	*olen += rounds * 64;	// 64 bytes produced per round
 
-		*s += 48;
-		*o += 64;
+	while (rounds > 0) {
+		if (rounds >= 8) {
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_neon32_inner(s, o);
+		break;
 	}
 }
diff --git a/lib/arch/neon32/enc_translate.c b/lib/arch/neon32/enc_translate.c
index ff460824..e616d54b 100644
--- a/lib/arch/neon32/enc_translate.c
+++ b/lib/arch/neon32/enc_translate.c
@@ -1,5 +1,5 @@
 static inline uint8x16x4_t
-enc_translate (uint8x16x4_t in)
+enc_translate (const uint8x16x4_t in)
 {
 	// A lookup table containing the absolute offsets for all ranges:
 	const uint8x16_t lut = {
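On the NEON side, vld3q_u8 and vst4q_u8 do the structural work: a 3-way deinterleaving load that splits 48 bytes into three 16-lane registers, and a 4-way interleaving store that merges four registers into 64 output bytes. A scalar model of those two operations, as a plain-C sketch rather than the NEON path itself:

#include <stdint.h>
#include <stdio.h>

// 3-way deinterleaving load, like vld3q_u8: lane i of val[k] gets
// input byte 3 * i + k.
static void deinterleave3 (const uint8_t *src, uint8_t val[3][16])
{
	for (int i = 0; i < 16; i++) {
		val[0][i] = src[3 * i + 0];
		val[1][i] = src[3 * i + 1];
		val[2][i] = src[3 * i + 2];
	}
}

// 4-way interleaving store, like vst4q_u8: output byte 4 * i + k
// comes from lane i of val[k].
static void interleave4 (uint8_t *dst, const uint8_t val[4][16])
{
	for (int i = 0; i < 16; i++) {
		dst[4 * i + 0] = val[0][i];
		dst[4 * i + 1] = val[1][i];
		dst[4 * i + 2] = val[2][i];
		dst[4 * i + 3] = val[3][i];
	}
}

int main (void)
{
	uint8_t in[48], out[64], val[4][16];

	for (int i = 0; i < 48; i++) {
		in[i] = (uint8_t) i;
	}

	deinterleave3(in, val);	// val[0] = 0, 3, 6, ...; val[1] = 1, 4, 7, ...

	for (int i = 0; i < 16; i++) {
		val[3][i] = 0;	// dummy fourth lane
	}

	interleave4(out, (const uint8_t (*)[16]) val);

	printf("%d %d %d %d\n", out[4], out[5], out[6], out[7]);	// 3 4 5 0
	return 0;
}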
diff --git a/lib/arch/neon64/enc_loop.c b/lib/arch/neon64/enc_loop.c
index 74fcf6e0..b9dcd322 100644
--- a/lib/arch/neon64/enc_loop.c
+++ b/lib/arch/neon64/enc_loop.c
@@ -1,3 +1,38 @@
+static inline void
+enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t *tbl_enc)
+{
+	uint8x16x4_t out;
+
+	// Load 48 bytes and deinterleave:
+	const uint8x16x3_t src = vld3q_u8(*s);
+
+	// Divide bits of three input bytes over four output bytes:
+	out.val[0] = vshrq_n_u8(src.val[0], 2);
+	out.val[1] = vshrq_n_u8(src.val[1], 4) | vshlq_n_u8(src.val[0], 4);
+	out.val[2] = vshrq_n_u8(src.val[2], 6) | vshlq_n_u8(src.val[1], 2);
+	out.val[3] = src.val[2];
+
+	// Clear top two bits:
+	out.val[0] &= vdupq_n_u8(0x3F);
+	out.val[1] &= vdupq_n_u8(0x3F);
+	out.val[2] &= vdupq_n_u8(0x3F);
+	out.val[3] &= vdupq_n_u8(0x3F);
+
+	// The bits have now been shifted to the right locations;
+	// translate their values 0..63 to the Base64 alphabet.
+	// Use a 64-byte table lookup:
+	out.val[0] = vqtbl4q_u8(*tbl_enc, out.val[0]);
+	out.val[1] = vqtbl4q_u8(*tbl_enc, out.val[1]);
+	out.val[2] = vqtbl4q_u8(*tbl_enc, out.val[2]);
+	out.val[3] = vqtbl4q_u8(*tbl_enc, out.val[3]);
+
+	// Interleave and store output:
+	vst4q_u8(*o, out);
+
+	*s += 48;
+	*o += 64;
+}
+
 static inline void
 enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 {
@@ -9,37 +44,34 @@ enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 	// Load the encoding table:
 	const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc);
 
-	while (rounds-- > 0) {
-		uint8x16x3_t src;
-		uint8x16x4_t out;
-
-		// Load 48 bytes and deinterleave:
-		src = vld3q_u8(*s);
-
-		// Divide bits of three input bytes over four output bytes:
-		out.val[0] = vshrq_n_u8(src.val[0], 2);
-		out.val[1] = vshrq_n_u8(src.val[1], 4) | vshlq_n_u8(src.val[0], 4);
-		out.val[2] = vshrq_n_u8(src.val[2], 6) | vshlq_n_u8(src.val[1], 2);
-		out.val[3] = src.val[2];
-
-		// Clear top two bits:
-		out.val[0] &= vdupq_n_u8(0x3F);
-		out.val[1] &= vdupq_n_u8(0x3F);
-		out.val[2] &= vdupq_n_u8(0x3F);
-		out.val[3] &= vdupq_n_u8(0x3F);
-
-		// The bits have now been shifted to the right locations;
-		// translate their values 0..63 to the Base64 alphabet.
-		// Use a 64-byte table lookup:
-		out.val[0] = vqtbl4q_u8(tbl_enc, out.val[0]);
-		out.val[1] = vqtbl4q_u8(tbl_enc, out.val[1]);
-		out.val[2] = vqtbl4q_u8(tbl_enc, out.val[2]);
-		out.val[3] = vqtbl4q_u8(tbl_enc, out.val[3]);
-
-		// Interleave and store output:
-		vst4q_u8(*o, out);
-
-		*s += 48;
-		*o += 64;
+	while (rounds > 0) {
+		if (rounds >= 8) {
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_neon64_inner(s, o, &tbl_enc);
+		break;
 	}
 }
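The bit division in enc_loop_neon64_inner is the whole-register version of the familiar scalar arithmetic: three bytes a, b, c become the four sextets a>>2, (a<<4 | b>>4), (b<<2 | c>>6) and c, each masked to six bits; vqtbl4q_u8 then maps every value through the 64-byte table, which is passed by pointer so it is loaded only once per call of the outer function. A standalone scalar check of those shifts:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main (void)
{
	const uint8_t a = 'M', b = 'a', c = 'n';

	// Same shift/or/mask scheme as the NEON code, on single bytes:
	const uint8_t i0 = (a >> 2) & 0x3F;
	const uint8_t i1 = ((b >> 4) | (a << 4)) & 0x3F;
	const uint8_t i2 = ((c >> 6) | (b << 2)) & 0x3F;
	const uint8_t i3 = c & 0x3F;

	// "Man" encodes to "TWFu", i.e. alphabet indices 19, 22, 5, 46:
	assert(i0 == 19 && i1 == 22 && i2 == 5 && i3 == 46);
	puts("ok");
	return 0;
}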
diff --git a/lib/arch/ssse3/enc_loop.c b/lib/arch/ssse3/enc_loop.c
index 87a50784..6de652e1 100644
--- a/lib/arch/ssse3/enc_loop.c
+++ b/lib/arch/ssse3/enc_loop.c
@@ -1,3 +1,22 @@
+static inline void
+enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o)
+{
+	// Load input:
+	__m128i str = _mm_loadu_si128((__m128i *) *s);
+
+	// Reshuffle:
+	str = enc_reshuffle(str);
+
+	// Translate reshuffled bytes to the Base64 alphabet:
+	str = enc_translate(str);
+
+	// Store:
+	_mm_storeu_si128((__m128i *) *o, str);
+
+	*s += 12;
+	*o += 16;
+}
+
 static inline void
 enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 {
@@ -15,20 +34,34 @@ enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 	*olen += rounds * 16;	// 16 bytes produced per round
 
 	do {
-		// Load string:
-		__m128i str = _mm_loadu_si128((__m128i *) *s);
-
-		// Reshuffle:
-		str = enc_reshuffle(str);
-
-		// Translate reshuffled bytes to the Base64 alphabet:
-		str = enc_translate(str);
-
-		// Store:
-		_mm_storeu_si128((__m128i *) *o, str);
-
-		*s += 12;
-		*o += 16;
+		if (rounds >= 8) {
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_ssse3_inner(s, o);
+			enc_loop_ssse3_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_ssse3_inner(s, o);
+		break;
 
-	} while (--rounds > 0);
+	} while (rounds > 0);
 }
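All of these loops share the same calling convention: they advance *s and *o and debit *slen / credit *olen for however many whole rounds they ran, and whatever the vector loop leaves behind (its round count is computed so the oversized load never overruns the input) falls through to a plain scalar encoder. A hedged sketch of that composition with a hypothetical enc_tail(); the real library's codec plumbing and padding handling are not shown:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// Standard Base64 alphabet, standing in for base64_table_enc:
static const char tbl[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

// Hypothetical scalar tail: encodes whole 3-byte groups, updating the
// cursors exactly like the vectorized loops do (padding not shown):
static void enc_tail (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	while (*slen >= 3) {
		const uint8_t *p = *s;

		*(*o)++ = tbl[p[0] >> 2];
		*(*o)++ = tbl[((p[0] << 4) | (p[1] >> 4)) & 0x3F];
		*(*o)++ = tbl[((p[1] << 2) | (p[2] >> 6)) & 0x3F];
		*(*o)++ = tbl[p[2] & 0x3F];

		*s    += 3;
		*slen -= 3;
		*olen += 4;
	}
}

int main (void)
{
	const uint8_t  in[] = "ManMan";
	const uint8_t *s    = in;
	size_t         slen = 6;
	uint8_t        out[9];
	uint8_t       *o    = out;
	size_t         olen = 0;

	// A real codec would first run a vectorized loop here, e.g.
	// enc_loop_ssse3(&s, &slen, &o, &olen), and let the tail finish:
	enc_tail(&s, &slen, &o, &olen);

	out[olen] = '\0';
	printf("%s\n", (const char *) out);	// TWFuTWFu
	return 0;
}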