#61: encoders: unroll inner loops
The transformation of encoder kernels to inline functions (#58) allows
us to move the inner encoding loop into separate inline functions.

Because the number of remaining loop iterations is known, we can split
calls to the inner loop into long unrolled stretches. Tests show that
this can result in significant speedups.
aklomp committed Nov 28, 2019
2 parents 0024d2b + 8680b68 commit dd8dc9b
Showing 8 changed files with 342 additions and 135 deletions.
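The commit applies the same dispatch pattern to every encoder's outer loop. Below is a minimal sketch of that pattern, distilled from the diffs that follow; the `inner` and `encode_rounds` names are placeholders for illustration, not library code:

```c
#include <stddef.h>
#include <stdint.h>

// Placeholder for a per-architecture inner step such as enc_loop_avx2_inner;
// the real versions consume a fixed number of input bytes per call and
// advance both pointers accordingly.
static inline void
inner (const uint8_t **s, uint8_t **o)
{
	*(*o)++ = *(*s)++;	// dummy body
}

// Dispatch on the known number of remaining rounds, unrolling the inner
// call in stretches of 8, 4 and 2 before falling back to a single call:
static void
encode_rounds (size_t rounds, const uint8_t **s, uint8_t **o)
{
	while (rounds > 0) {
		if (rounds >= 8) {
			inner(s, o); inner(s, o); inner(s, o); inner(s, o);
			inner(s, o); inner(s, o); inner(s, o); inner(s, o);
			rounds -= 8;
			continue;
		}
		if (rounds >= 4) {
			inner(s, o); inner(s, o); inner(s, o); inner(s, o);
			rounds -= 4;
			continue;
		}
		if (rounds >= 2) {
			inner(s, o); inner(s, o);
			rounds -= 2;
			continue;
		}
		inner(s, o);
		break;
	}
}
```

The unrolled stretches give the compiler long straight-line runs to schedule, while the 8/4/2/1 dispatch keeps the number of loop-count checks per block small.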
10 changes: 5 additions & 5 deletions README.md
@@ -434,11 +434,11 @@ x86 processors
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4884\* | 7099\* | 4917\* | 7057\* | 4799\* | 7143\* | 4902\* | 7219\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 5212\* | 8849\* | 5284\* | 9099\* | 5289\* | 9220\* | 4849\* | 9200\* |
| i7-4870HQ @ 2.5 GHz | 1471\* | 3066\* | 6721\* | 6962\* | 7015\* | 8267\* | 8328\* | 11576\* |
-| i5-4590S @ 3.0 GHz | 1858 | 3090 | 4329 | 5825 | 4305 | 6092 | 4241 | 6174 |
-| Xeon X5570 @ 2.93 GHz | 1137 | 1473 | 2536 | 3283 | - | - | - | - |
+| i5-4590S @ 3.0 GHz | 1843 | 3090 | 4363 | 5825 | 4243 | 6092 | 4160 | 6174 |
+| Xeon X5570 @ 2.93 GHz | 1151 | 1473 | 3160 | 3283 | - | - | - | - |
| Pentium4 @ 3.4 GHz | 528\* | 448\* | - | - | - | - | - | - |
-| Atom N270 | 126 | 233 | 484 | 407 | - | - | - | - |
-| AMD E-450 | 378 | 564 | 614 | 660 | - | - | - | - |
+| Atom N270 | 108 | 233 | 508 | 407 | - | - | - | - |
+| AMD E-450 | 395 | 564 | 625 | 660 | - | - | - | - |
| Intel Edison @ 500 MHz | 79\* | 92\* | 152\* | 172\* | - | - | - | - |
| Intel Edison @ 500 MHz OPENMP 2 thread | 158\* | 184\* | 300\* | 343\* | - | - | - | - |
| Intel Edison @ 500 MHz (x86-64) | 97\* | 146\* | 197\* | 207\* | - | - | - | - |
@@ -449,7 +449,7 @@ ARM processors
| Processor | Plain enc | Plain dec | NEON32 enc | NEON32 dec | NEON64 enc | NEON64 dec |
|-------------------------------------------|----------:|----------:|-----------:|-----------:|-----------:|-----------:|
| Raspberry PI B+ V1.2 | 46\* | 40\* | - | - | - | - |
-| Raspberry PI 2 B V1.1 | 86 | 84 | 188 | 225 | - | - |
+| Raspberry PI 2 B V1.1 | 87 | 84 | 192 | 225 | - | - |
| Apple iPhone SE armv7 | 1056\* | 895\* | 2943\* | 2618\* | - | - |
| Apple iPhone SE arm64 | 1061\* | 1239\* | - | - | 4098\* | 3983\* |

88 changes: 66 additions & 22 deletions lib/arch/avx2/enc_loop.c
@@ -1,3 +1,37 @@
+static inline void
+enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
+{
+	// First load is done at s - 0 to not get a segfault:
+	__m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+	// Shift by 4 bytes, as required by enc_reshuffle:
+	src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
+
+	// Reshuffle, translate, store:
+	src = enc_reshuffle(src);
+	src = enc_translate(src);
+	_mm256_storeu_si256((__m256i *) *o, src);
+
+	// Subsequent loads will be done at s - 4, set pointer for next round:
+	*s += 20;
+	*o += 32;
+}
+
+static inline void
+enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
+{
+	// Load input:
+	__m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+	// Reshuffle, translate, store:
+	src = enc_reshuffle(src);
+	src = enc_translate(src);
+	_mm256_storeu_si256((__m256i *) *o, src);
+
+	*s += 24;
+	*o += 32;
+}
+
static inline void
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
@@ -14,30 +48,40 @@ enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
	*slen -= rounds * 24; // 24 bytes consumed per round
	*olen += rounds * 32; // 32 bytes produced per round

-	// First load is done at s - 0 to not get a segfault:
-	__m256i inputvector = _mm256_loadu_si256((__m256i *) *s);
-
-	// Subsequent loads will be done at s - 4, set pointer for next round:
-	*s += 20;
-
-	// Shift by 4 bytes, as required by enc_reshuffle:
-	inputvector = _mm256_permutevar8x32_epi32(inputvector, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
-
-	for (;;) {
-
-		// Reshuffle, translate, store:
-		inputvector = enc_reshuffle(inputvector);
-		inputvector = enc_translate(inputvector);
-		_mm256_storeu_si256((__m256i *) *o, inputvector);
-		*o += 32;
+	// The first loop iteration requires special handling to ensure that
+	// the read, which is done at an offset, does not underflow the buffer:
+	enc_loop_avx2_inner_first(s, o);
+	rounds--;

-		if (--rounds == 0) {
-			break;
+	while (rounds > 0) {
+		if (rounds >= 8) {
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 8;
+			continue;
		}
-
-		// Load for the next round:
-		inputvector = _mm256_loadu_si256((__m256i *) *s);
-		*s += 24;
+		if (rounds >= 4) {
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_avx2_inner(s, o);
+		break;
	}

	// Add the offset back:
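The special first iteration exists because `enc_reshuffle` wants its 24 payload bytes at an offset of 4 within the 32-byte register; after the first round, `*s` is advanced by only 20, so every later round can simply load at `*s`, which then points 4 bytes before the next unprocessed input. The following standalone check is hypothetical and not part of the library (compile with `-mavx2`); it only illustrates why the lane permutation can stand in for an offset load on the first round:

```c
// Hypothetical standalone check, not library code: loading at `p` and
// permuting lanes (0, 0, 1, 2, 3, 4, 5, 6) yields the same register
// contents as loading at `p - 4`, except in the first 4 bytes, which
// enc_reshuffle ignores.
#include <immintrin.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>

int main(void)
{
	uint8_t buf[64];
	for (int i = 0; i < 64; i++)
		buf[i] = (uint8_t) i;

	const uint8_t *p = buf + 4;	// stand-in for the input pointer *s

	__m256i a = _mm256_loadu_si256((const __m256i *) p);
	a = _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));

	__m256i b = _mm256_loadu_si256((const __m256i *) (p - 4));

	uint8_t av[32], bv[32];
	_mm256_storeu_si256((__m256i *) av, a);
	_mm256_storeu_si256((__m256i *) bv, b);

	// Bytes 4..31 must match; bytes 0..3 may differ:
	assert(memcmp(av + 4, bv + 4, 28) == 0);
	return 0;
}
```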
73 changes: 53 additions & 20 deletions lib/arch/generic/32/enc_loop.c
@@ -1,3 +1,27 @@
+static inline void
+enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
+{
+	uint32_t src;
+
+	// Load input:
+	memcpy(&src, *s, sizeof (src));
+
+	// Reorder to 32-bit big-endian, if not already in that format. The
+	// workset must be in big-endian, otherwise the shifted bits do not
+	// carry over properly among adjacent bytes:
+	src = BASE64_HTOBE32(src);
+
+	// Shift input by 6 bytes each round and mask in only the lower 6 bits;
+	// look up the character in the Base64 encoding table and write it to
+	// the output location:
+	*(*o)++ = base64_table_enc[(src >> 26) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 20) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 14) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 8) & 0x3F];
+
+	*s += 3;
+}
+
static inline void
enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
@@ -15,25 +39,34 @@ enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
	*olen += rounds * 4; // 4 bytes produced per round

	do {
-		uint32_t src;
-
-		// Load input:
-		memcpy(&src, *s, sizeof (src));
-
-		// Reorder to 32-bit big-endian, if not already in that format.
-		// The workset must be in big-endian, otherwise the shifted
-		// bits do not carry over properly among adjacent bytes:
-		src = BASE64_HTOBE32(src);
-
-		// Shift input by 6 bytes each round and mask in only the lower
-		// 6 bits; look up the character in the Base64 encoding table
-		// and write it to the output location:
-		*(*o)++ = base64_table_enc[(src >> 26) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 20) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 14) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 8) & 0x3F];
-
-		*s += 3;
+		if (rounds >= 8) {
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_generic_32_inner(s, o);
+			enc_loop_generic_32_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_generic_32_inner(s, o);
+		break;

-	} while (--rounds > 0);
+	} while (rounds > 0);
}
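The generic 32-bit inner step packs three input bytes into the top of a big-endian word and peels off four 6-bit indices. The following standalone sketch is hypothetical: the `encode3` helper and the inlined alphabet string stand in for the library's `base64_table_enc`, but the shift-and-mask arithmetic is the same, reproducing the classic "Man" → "TWFu" example:

```c
#include <stdint.h>
#include <stdio.h>

// Standard Base64 alphabet, standing in for base64_table_enc:
static const char tbl[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

// Encode exactly three bytes into four Base64 characters using the same
// big-endian shift-and-mask scheme as enc_loop_generic_32_inner:
static void
encode3 (const uint8_t *in, char out[4])
{
	// Equivalent to loading 4 bytes and converting to big-endian,
	// with the fourth (ignored) byte left as zero:
	uint32_t src = ((uint32_t) in[0] << 24)
	             | ((uint32_t) in[1] << 16)
	             | ((uint32_t) in[2] <<  8);

	out[0] = tbl[(src >> 26) & 0x3F];	// bits 31..26
	out[1] = tbl[(src >> 20) & 0x3F];	// bits 25..20
	out[2] = tbl[(src >> 14) & 0x3F];	// bits 19..14
	out[3] = tbl[(src >>  8) & 0x3F];	// bits 13..8
}

int main(void)
{
	char out[5] = {0};
	encode3((const uint8_t *) "Man", out);
	printf("%s\n", out);	// prints "TWFu"
	return 0;
}
```

The 64-bit variant in the next file works the same way, except that it loads eight bytes at a time and emits eight output characters per round.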
83 changes: 58 additions & 25 deletions lib/arch/generic/64/enc_loop.c
@@ -1,3 +1,31 @@
+static inline void
+enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
+{
+	uint64_t src;
+
+	// Load input:
+	memcpy(&src, *s, sizeof (src));
+
+	// Reorder to 64-bit big-endian, if not already in that format. The
+	// workset must be in big-endian, otherwise the shifted bits do not
+	// carry over properly among adjacent bytes:
+	src = BASE64_HTOBE64(src);
+
+	// Shift input by 6 bytes each round and mask in only the lower 6 bits;
+	// look up the character in the Base64 encoding table and write it to
+	// the output location:
+	*(*o)++ = base64_table_enc[(src >> 58) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 52) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 46) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 40) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 34) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 28) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 22) & 0x3F];
+	*(*o)++ = base64_table_enc[(src >> 16) & 0x3F];
+
+	*s += 6;
+}
+
static inline void
enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
@@ -15,29 +43,34 @@ enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
	*olen += rounds * 8; // 8 bytes produced per round

	do {
-		uint64_t src;
-
-		// Load input:
-		memcpy(&src, *s, sizeof (src));
-
-		// Reorder to 64-bit big-endian, if not already in that format.
-		// The workset must be in big-endian, otherwise the shifted
-		// bits do not carry over properly among adjacent bytes:
-		src = BASE64_HTOBE64(src);
-
-		// Shift input by 6 bytes each round and mask in only the lower
-		// 6 bits; look up the character in the Base64 encoding table
-		// and write it to the output location:
-		*(*o)++ = base64_table_enc[(src >> 58) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 52) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 46) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 40) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 34) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 28) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 22) & 0x3F];
-		*(*o)++ = base64_table_enc[(src >> 16) & 0x3F];
-
-		*s += 6;
-
-	} while (--rounds > 0);
+		if (rounds >= 8) {
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_generic_64_inner(s, o);
+			enc_loop_generic_64_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_generic_64_inner(s, o);
+		break;
+
+	} while (rounds > 0);
}
62 changes: 47 additions & 15 deletions lib/arch/neon32/enc_loop.c
@@ -1,26 +1,58 @@
static inline void
-enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
{
-	size_t rounds = *slen / 48;
+	// Load 48 bytes and deinterleave:
+	uint8x16x3_t src = vld3q_u8(*s);

-	*slen -= rounds * 48; // 48 bytes consumed per round
-	*olen += rounds * 64; // 64 bytes produced per round
+	// Reshuffle:
+	uint8x16x4_t out = enc_reshuffle(src);

-	while (rounds-- > 0) {
+	// Translate reshuffled bytes to the Base64 alphabet:
+	out = enc_translate(out);

-		// Load 48 bytes and deinterleave:
-		uint8x16x3_t src = vld3q_u8(*s);
+	// Interleave and store output:
+	vst4q_u8(*o, out);

-		// Reshuffle:
-		uint8x16x4_t out = enc_reshuffle(src);
+	*s += 48;
+	*o += 64;
+}

-		// Translate reshuffled bytes to the Base64 alphabet:
-		out = enc_translate(out);
+static inline void
+enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	size_t rounds = *slen / 48;

-		// Interleave and store output:
-		vst4q_u8(*o, out);
+	*slen -= rounds * 48; // 48 bytes consumed per round
+	*olen += rounds * 64; // 64 bytes produced per round

-		*s += 48;
-		*o += 64;
+	while (rounds > 0) {
+		if (rounds >= 8) {
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_neon32_inner(s, o);
+			enc_loop_neon32_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_neon32_inner(s, o);
+		break;
	}
}
2 changes: 1 addition & 1 deletion lib/arch/neon32/enc_translate.c
@@ -1,5 +1,5 @@
static inline uint8x16x4_t
-enc_translate (uint8x16x4_t in)
+enc_translate (const uint8x16x4_t in)
{
	// A lookup table containing the absolute offsets for all ranges:
	const uint8x16_t lut = {