NEON optimizations for ZIP reading (reconstruct and interleave) #1348

Merged: 1 commit, Mar 3, 2023
src/lib/OpenEXR/ImfZip.cpp: 93 additions, 0 deletions
@@ -160,6 +160,56 @@ reconstruct_sse41 (char* buf, size_t outSize)

#endif

#ifdef IMF_HAVE_NEON

void
reconstruct_neon (char* buf, size_t outSize)
{
static const size_t bytesPerChunk = sizeof (uint8x16_t);
const size_t vOutSize = outSize / bytesPerChunk;

const uint8x16_t c = vdupq_n_u8 (-128);
const uint8x16_t shuffleMask = vdupq_n_u8 (15);

// The first element doesn't have its high bit flipped during compression,
// so it must not be flipped here. To make the SIMD loop nice and
// uniform, we pre-flip the bit so that the loop will unflip it again.
buf[0] += -128;

unsigned char* vBuf = reinterpret_cast<unsigned char*> (buf);
uint8x16_t vZero = vdupq_n_u8 (0);
uint8x16_t vPrev = vdupq_n_u8 (0);
for (size_t i = 0; i < vOutSize; ++i)
{
uint8x16_t d = vaddq_u8 (vld1q_u8 (vBuf), c);

// Compute the prefix sum of elements.
d = vaddq_u8 (d, vextq_u8 (vZero, d, 16 - 1));
d = vaddq_u8 (d, vextq_u8 (vZero, d, 16 - 2));
d = vaddq_u8 (d, vextq_u8 (vZero, d, 16 - 4));
d = vaddq_u8 (d, vextq_u8 (vZero, d, 16 - 8));
d = vaddq_u8 (d, vPrev);

vst1q_u8 (vBuf, d);
vBuf += sizeof (uint8x16_t);

// Broadcast the high byte in our result to all lanes of the prev
// value for the next iteration.
vPrev = vqtbl1q_u8 (d, shuffleMask);
}

unsigned char prev = vgetq_lane_u8 (vPrev, 15);
for (size_t i = vOutSize * bytesPerChunk; i < outSize; ++i)
{
unsigned char d = prev + buf[i] - 128;
buf[i] = d;
prev = d;
}
}

#endif
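The four vextq_u8 / vaddq_u8 steps above are a shift-and-add (Hillis-Steele) inclusive prefix sum over the 16 byte lanes: after them, lane i holds the running total of the biased deltas 0..i within the chunk, and adding vPrev carries that total across chunks. A minimal scalar sketch of the same scan, for illustration only (the helper name is hypothetical, not part of the patch):

/* Illustrative only: the same shift-and-add scan on a plain 16-byte array. */
static void
prefix_sum_16_sketch (unsigned char s[16])
{
    for (int shift = 1; shift < 16; shift *= 2) /* 1, 2, 4, 8 */
        for (int i = 15; i >= shift; --i)       /* walk downward so each pass reads pre-pass values */
            s[i] = (unsigned char) (s[i] + s[i - shift]);
}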


void
reconstruct_scalar (char* buf, size_t outSize)
{
@@ -212,6 +262,44 @@ interleave_sse2 (const char* source, size_t outSize, char* out)

#endif

#ifdef IMF_HAVE_NEON

void
interleave_neon (const char* source, size_t outSize, char* out)
{
static const size_t bytesPerChunk = 2 * sizeof (uint8x16_t);

const size_t vOutSize = outSize / bytesPerChunk;

const unsigned char* v1 = reinterpret_cast<const unsigned char*> (source);
const unsigned char* v2 =
reinterpret_cast<const unsigned char*> (source + (outSize + 1) / 2);
unsigned char* vOut = reinterpret_cast<unsigned char*> (out);

for (size_t i = 0; i < vOutSize; ++i)
{
uint8x16_t a = vld1q_u8 (v1); v1 += sizeof (uint8x16_t);
uint8x16_t b = vld1q_u8 (v2); v2 += sizeof (uint8x16_t);

uint8x16_t lo = vzip1q_u8 (a, b);
uint8x16_t hi = vzip2q_u8 (a, b);

vst1q_u8 (vOut, lo); vOut += sizeof (uint8x16_t);
vst1q_u8 (vOut, hi); vOut += sizeof (uint8x16_t);
}

const char* t1 = reinterpret_cast<const char*> (v1);
const char* t2 = reinterpret_cast<const char*> (v2);
char* sOut = reinterpret_cast<char*> (vOut);

for (size_t i = vOutSize * bytesPerChunk; i < outSize; ++i)
{
*(sOut++) = (i % 2 == 0) ? *(t1++) : *(t2++);
}
}

#endif
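Here vzip1q_u8 interleaves the low eight bytes of a and b and vzip2q_u8 the high eight, so each iteration emits 32 output bytes whose even positions come from the first half of the source and whose odd positions come from the second half. A scalar sketch of what one iteration produces, for illustration only (the helper name is hypothetical):

/* Illustrative only: scalar equivalent of one 32-byte NEON iteration above. */
static void
interleave_chunk_sketch (const unsigned char* a, const unsigned char* b, unsigned char* out)
{
    for (int k = 0; k < 16; ++k)
    {
        out[2 * k]     = a[k]; /* even output bytes: first half of source  */
        out[2 * k + 1] = b[k]; /* odd output bytes:  second half of source */
    }
}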

void
interleave_scalar (const char* source, size_t outSize, char* out)
{
@@ -291,6 +379,11 @@ Zip::initializeFuncs ()
interleave = interleave_sse2;
}
#endif

#ifdef IMF_HAVE_NEON
reconstruct = reconstruct_neon;
interleave = interleave_neon;
#endif
}

OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_EXIT
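Because the NEON function pointers are installed unconditionally whenever IMF_HAVE_NEON is defined, one quick way to build confidence in the new paths is to compare them against the scalar ones on an odd-sized buffer, so that both the vector loops and the scalar tails are exercised. A hypothetical sanity check, not part of the patch, assuming it is placed in this translation unit where the helpers are visible (IMF_ZIP_SELF_TEST is an invented guard):

#if defined(IMF_HAVE_NEON) && defined(IMF_ZIP_SELF_TEST)
#    include <assert.h>
#    include <stdlib.h>
#    include <string.h>
static void
testZipSimdPaths (void)
{
    enum { N = 1003 }; /* deliberately not a multiple of 16, so the tail loops run */
    char a[N], b[N], i1[N], i2[N];
    for (size_t i = 0; i < N; ++i)
        a[i] = b[i] = (char) (rand () & 0xff);

    reconstruct_scalar (a, N);
    reconstruct_neon (b, N);
    assert (memcmp (a, b, N) == 0);

    interleave_scalar (a, N, i1);
    interleave_neon (a, N, i2);
    assert (memcmp (i1, i2, N) == 0);
}
#endif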
src/lib/OpenEXRCore/internal_zip.c: 77 additions, 0 deletions
@@ -24,6 +24,11 @@
# define IMF_HAVE_SSE4_1 1
# include <smmintrin.h>
#endif
#if defined(__ARM_NEON)
# define IMF_HAVE_NEON 1
# include <arm_neon.h>
#endif


/**************************************/

@@ -73,6 +78,54 @@ reconstruct (uint8_t* buf, uint64_t outSize)
prev = d;
}
}
#elif defined(IMF_HAVE_NEON)
static void
reconstruct (uint8_t* buf, uint64_t outSize)
{
static const uint64_t bytesPerChunk = sizeof (uint8x16_t);
const uint64_t vOutSize = outSize / bytesPerChunk;
const uint8x16_t c = vdupq_n_u8 (-128);
const uint8x16_t shuffleMask = vdupq_n_u8 (15);
const uint8x16_t zero = vdupq_n_u8 (0);
uint8_t * vBuf;
uint8x16_t vPrev;
uint8_t prev;

/*
* The first element doesn't have its high bit flipped during compression,
* so it must not be flipped here. To make the SIMD loop nice and
* uniform, we pre-flip the bit so that the loop will unflip it again.
*/
buf[0] += -128;
vBuf = buf;
vPrev = vdupq_n_u8 (0);

for (uint64_t i = 0; i < vOutSize; ++i)
{
uint8x16_t d = vaddq_u8 (vld1q_u8 (vBuf), c);

/* Compute the prefix sum of elements. */
d = vaddq_u8 (d, vextq_u8 (zero, d, 16 - 1));
d = vaddq_u8 (d, vextq_u8 (zero, d, 16 - 2));
d = vaddq_u8 (d, vextq_u8 (zero, d, 16 - 4));
d = vaddq_u8 (d, vextq_u8 (zero, d, 16 - 8));
d = vaddq_u8 (d, vPrev);

vst1q_u8 (vBuf, d); vBuf += sizeof (uint8x16_t);

        /* Broadcast the high byte in our result to all lanes of the prev
         * value for the next iteration. */
vPrev = vqtbl1q_u8 (d, shuffleMask);
}

prev = vgetq_lane_u8 (vPrev, 15);
for (uint64_t i = vOutSize * bytesPerChunk; i < outSize; ++i)
{
uint8_t d = prev + buf[i] - 128;
buf[i] = d;
prev = d;
}
}
#else
static void
reconstruct (uint8_t* buf, uint64_t sz)
@@ -121,6 +174,30 @@ interleave (uint8_t* out, const uint8_t* source, uint64_t outSize)
*(sOut++) = (i % 2 == 0) ? *(t1++) : *(t2++);
}

#elif defined(IMF_HAVE_NEON)
static void
interleave (uint8_t* out, const uint8_t* source, uint64_t outSize)
{
static const uint64_t bytesPerChunk = 2 * sizeof (uint8x16_t);
const uint64_t vOutSize = outSize / bytesPerChunk;
const uint8_t* v1 = source;
const uint8_t* v2 = source + (outSize + 1) / 2;

for (uint64_t i = 0; i < vOutSize; ++i)
{
uint8x16_t a = vld1q_u8 (v1); v1 += sizeof (uint8x16_t);
uint8x16_t b = vld1q_u8 (v2); v2 += sizeof (uint8x16_t);
uint8x16_t lo = vzip1q_u8 (a, b);
uint8x16_t hi = vzip2q_u8 (a, b);

vst1q_u8 (out, lo); out += sizeof (uint8x16_t);
vst1q_u8 (out, hi); out += sizeof (uint8x16_t);
}

for (uint64_t i = vOutSize * bytesPerChunk; i < outSize; ++i)
*(out++) = (i % 2 == 0) ? *(v1++) : *(v2++);
}

#else

static void
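For reference, the two decoders undo a pair of compression-side transforms. Judging from the decode code above, the writer first splits the byte stream into two halves (even-indexed bytes, then odd-indexed bytes) and then delta-encodes each byte against its predecessor with a +128 bias, keeping the very first byte unchanged. A rough sketch of that forward direction, inferred from the decoders and shown only for orientation (the helper name is hypothetical; the real encoder lives elsewhere in the library):

#include <stddef.h> /* size_t */

/* Illustrative only: the compression-side transforms that reconstruct()
 * and interleave() invert. */
static void
split_and_delta_sketch (const unsigned char* src, size_t n, unsigned char* dst)
{
    /* 1. Split: even-indexed bytes into the first half, odd-indexed into the second. */
    unsigned char* t1 = dst;               /* first half, (n + 1) / 2 bytes */
    unsigned char* t2 = dst + (n + 1) / 2; /* second half */
    for (size_t i = 0; i < n; ++i)
    {
        if (i % 2 == 0)
            *(t1++) = src[i];
        else
            *(t2++) = src[i];
    }

    /* 2. Delta-encode with a +128 bias; the first byte is stored as-is. */
    unsigned char prev = dst[0];
    for (size_t i = 1; i < n; ++i)
    {
        unsigned char d = (unsigned char) (dst[i] - prev + 128);
        prev            = dst[i];
        dst[i]          = d;
    }
}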