From 621f2f8eb6cb7425c172985c023ee2a75697a277 Mon Sep 17 00:00:00 2001 From: Kimball Thurston Date: Sat, 5 Oct 2024 14:44:46 +1300 Subject: [PATCH] Apply some minor optimizations to piz decode Signed-off-by: Kimball Thurston --- src/lib/OpenEXRCore/internal_piz.c | 70 ++++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 8 deletions(-) diff --git a/src/lib/OpenEXRCore/internal_piz.c b/src/lib/OpenEXRCore/internal_piz.c index f330888242..0f2b781aaa 100644 --- a/src/lib/OpenEXRCore/internal_piz.c +++ b/src/lib/OpenEXRCore/internal_piz.c @@ -66,8 +66,14 @@ forwardLutFromBitmap (const uint8_t* bitmap, uint16_t* lut) return k - 1; } +#ifdef __cplusplus +# define NO_ALIAS +#else +# define NO_ALIAS restrict +#endif + static inline uint16_t -reverseLutFromBitmap (const uint8_t* bitmap, uint16_t* lut) +reverseLutFromBitmap (const uint8_t* NO_ALIAS bitmap, uint16_t* NO_ALIAS lut) { uint32_t n, k = 0; @@ -86,8 +92,20 @@ reverseLutFromBitmap (const uint8_t* bitmap, uint16_t* lut) } static inline void -applyLut (const uint16_t* lut, uint16_t* data, uint64_t nData) +applyLut (const uint16_t lut[NO_ALIAS USHORT_RANGE], uint16_t* NO_ALIAS data, uint64_t nData) { + /* partially unrolling is noticeably faster */ + uint16_t dvals[8]; + while ( nData > 8 ) + { + memcpy (dvals, data, sizeof(uint16_t)*8); + for ( int i = 0; i < 8; ++i ) + dvals[i] = lut[dvals[i]]; + memcpy (data, dvals, sizeof(uint16_t)*8); + data += 8; + nData -= 8; + } + for (uint64_t i = 0; i < nData; ++i) data[i] = lut[data[i]]; } @@ -113,14 +131,53 @@ wenc14 (uint16_t a, uint16_t b, uint16_t* l, uint16_t* h) *h = (uint16_t) ds; } +static inline void +wdec14_4 (uint16_t* px, uint16_t* p01, uint16_t* p10, uint16_t* p11) +{ + /* pre swap + * px, p01, p10, p11 + * px -> a + * p10 -> b + * p01 -> c + * p11 -> d + * */ + int16_t a = (int16_t) *px; + int16_t b = (int16_t) *p10; + int16_t c = (int16_t) *p01; + int16_t d = (int16_t) *p11; + + int ai = (int) a; + int bi = (int) b; + int ci = (int) c; + int di = (int) d; + + int i00 = ai + (bi & 1) + (bi >> 1); + int i10 = i00 - bi; + int i01 = ci + (di & 1) + (di >> 1); + int i11 = i01 - di; + + ai = i00 + (i01 & 1) + (i01 >> 1); + bi = ai - i01; + ci = i10 + (i11 & 1) + (i11 >> 1); + di = ci - i11; + + /* different output order */ + /* px, p01, p10, p11 */ + *px = (uint16_t) ai; + *p01 = (uint16_t) bi; + *p10 = (uint16_t) ci; + *p11 = (uint16_t) di; +} + static inline void wdec14 (uint16_t l, uint16_t h, uint16_t* a, uint16_t* b) { int16_t ls = (int16_t) l; int16_t hs = (int16_t) h; - int hi = hs; - int ai = ls + (hi & 1) + (hi >> 1); + int hi = (int) hs; + int li = (int) ls; + int ai = li + (hi & 1) + (hi >> 1); int16_t as = (int16_t) ai; int16_t bs = (int16_t) (ai - hi); @@ -339,10 +396,7 @@ wav_2D_decode ( if (w14) { - wdec14 (*px, *p10, &i00, &i10); - wdec14 (*p01, *p11, &i01, &i11); - wdec14 (i00, i01, px, p01); - wdec14 (i10, i11, p10, p11); + wdec14_4 (px, p01, p10, p11); } else {