From 4ea2af9c1d4c4126a8b3166510e6d3eed791615a Mon Sep 17 00:00:00 2001 From: Kimball Thurston Date: Sat, 5 Oct 2024 14:44:26 +1300 Subject: [PATCH] Enable fast huf for os/x, optimize routines Signed-off-by: Kimball Thurston --- src/lib/OpenEXRCore/internal_huf.c | 270 +++++++++++++++-------------- 1 file changed, 138 insertions(+), 132 deletions(-) diff --git a/src/lib/OpenEXRCore/internal_huf.c b/src/lib/OpenEXRCore/internal_huf.c index 21f554c4d6..9c5f1795b4 100644 --- a/src/lib/OpenEXRCore/internal_huf.c +++ b/src/lib/OpenEXRCore/internal_huf.c @@ -8,6 +8,7 @@ #include "internal_memory.h" #include "internal_xdr.h" #include "internal_structs.h" +#include "internal_coding.h" #include #include @@ -1001,37 +1002,21 @@ readUInt (const uint8_t* b) // Number of bits in our acceleration table. Should match all // codes up to TABLE_LOOKUP_BITS in length. -#define TABLE_LOOKUP_BITS 14 -#include +// old c++ code was only 12 bits, not 14 as you might expect from the encode +// but only having a memory page for the lookup does seem marginally faster +#define TABLE_LOOKUP_BITS 12 +//#define TABLE_LOOKUP_BITS HUF_DECBITS -#ifdef __APPLE__ -# include -# define SWAP64(c) OSSwapInt64 (c) -#elif defined(linux) -# include -# define SWAP64(c) bswap_64 (c) -#elif defined(_MSC_VER) -# include -# define SWAP64(c) _byteswap_uint64 (c) -#endif +#include -#ifdef SWAP64 -static inline uint64_t -READ64 (const uint8_t* src) -{ - uint64_t v; - // unaligned reads are UB - memcpy (&v, src, sizeof (uint64_t)); - return SWAP64 (v); -} -#else +/* unaligned reads are UB, so attempts to work around that result in + * slower code than just doing the simple thing */ # define READ64(c) \ ((uint64_t) (c)[0] << 56) | ((uint64_t) (c)[1] << 48) | \ ((uint64_t) (c)[2] << 40) | ((uint64_t) (c)[3] << 32) | \ ((uint64_t) (c)[4] << 24) | ((uint64_t) (c)[5] << 16) | \ ((uint64_t) (c)[6] << 8) | ((uint64_t) (c)[7]) -#endif typedef struct FastHufDecoder { @@ -1047,11 +1032,10 @@ typedef struct FastHufDecoder uint8_t _maxCodeLength; // Maximum code length, in bits. uint8_t _pad[2]; - int _idToSymbol[65536 + 1]; // Maps Ids to symbols. Ids are a symbol - // ordering sorted first in terms of - // code length, and by code within - // the same length. Ids run from 0 - // to mNumSymbols-1. + // Maps Ids to symbols. Ids are a symbol ordering sorted first in + // terms of code length, and by code within the same length. Ids + // run from 0 to mNumSymbols-1. + uint32_t _idToSymbol[65536 + 1]; uint64_t _ljBase[MAX_CODE_LEN + 1 + 1]; // the 'left justified base' table. // Takes base[i] (i = code length) @@ -1073,8 +1057,9 @@ typedef struct FastHufDecoder // a symbol to indicate RLE. So with a dense code book, we could // have 2^16+1 codes, hence 'symbol' could be bigger than 16 bits. // - int _lookupSymbol - [1 << TABLE_LOOKUP_BITS]; /* value = (codeLen << 24) | symbol */ + // value = (codeLen << 24) | symbol + // rather than storing two tables + uint32_t _lookupSymbol[1 << TABLE_LOOKUP_BITS]; uint64_t _tableMin; } FastHufDecoder; @@ -1179,77 +1164,87 @@ FastHufDecoder_buildTables ( return EXR_ERR_SUCCESS; } +#ifdef __cplusplus +# define NO_ALIAS +#else +# define NO_ALIAS restrict +#endif + static inline void FastHufDecoder_refill ( - uint64_t* buffer, - int numBits, // number of bits to refill - uint64_t* bufferBack, // the next 64-bits, to refill from - int* bufferBackNumBits, // number of bits left in bufferBack - const uint8_t** currByte, // current byte in the bitstream - uint64_t* currBitsLeft) + uint64_t* NO_ALIAS buffer, + int numBits, // number of bits to refill + uint64_t* NO_ALIAS bufferBack, // the next 64-bits, to refill from + int* bufferBackNumBits, // number of bits left in bufferBack + const uint8_t* NO_ALIAS * NO_ALIAS currByte, // current byte in the bitstream + uint64_t* NO_ALIAS currBitsLeft) { - // - // Refill bits into the bottom of buffer, from the top of bufferBack. - // Always top up buffer to be completely full. - // - - *buffer |= (*bufferBack) >> (64 - numBits); + int needBits; - if (*bufferBackNumBits < numBits) +#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) + if (__builtin_expect ((numBits == 0), 0)) +#else + if (numBits == 0) +#endif { - numBits -= *bufferBackNumBits; - - // - // Refill all of bufferBack from the bitstream. Either grab - // a full 64-bit chunk, or whatever bytes are left. If we - // don't have 64-bits left, pad with 0's. - // + *buffer = *bufferBack; + numBits = *bufferBackNumBits; + *bufferBackNumBits = 0; + } - if (*currBitsLeft >= 64) - { - *bufferBack = READ64 (*currByte); - *bufferBackNumBits = 64; - *currByte += sizeof (uint64_t); - *currBitsLeft -= 8 * sizeof (uint64_t); - } - else + while (numBits < 64) + { + if (*bufferBackNumBits <= 0) { - uint64_t shift = 56; +#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) + if (__builtin_expect ((*currBitsLeft >= 64), 1)) +#else + if (*currBitsLeft >= 64) +#endif + { + *bufferBack = READ64 (*currByte); + *bufferBackNumBits = 64; + *currByte += sizeof (uint64_t); + *currBitsLeft -= 8 * sizeof (uint64_t); + } + else + { + uint64_t shift = 56; - *bufferBack = 0; - *bufferBackNumBits = 64; + *bufferBack = 0; + *bufferBackNumBits = 64; - while (*currBitsLeft >= 8) - { - *bufferBack |= ((uint64_t) (**currByte)) << shift; + while (*currBitsLeft >= 8) + { + *bufferBack |= ((uint64_t) (**currByte)) << shift; - (*currByte)++; - shift -= 8; - *currBitsLeft -= 8; - } + (*currByte)++; + shift -= 8; + *currBitsLeft -= 8; + } - if (*currBitsLeft > 0) - { - *bufferBack |= ((uint64_t) (**currByte)) << shift; + if (*currBitsLeft > 0) + { + *bufferBack |= ((uint64_t) (**currByte)) << shift; - (*currByte)++; - shift -= 8; - *currBitsLeft = 0; + (*currByte)++; + shift -= 8; + *currBitsLeft = 0; + } } } - *buffer |= (*bufferBack) >> (64 - numBits); + needBits = 64 - numBits; + *buffer |= (*bufferBack) >> numBits; + if (*bufferBackNumBits >= needBits) + { + *bufferBack <<= needBits; + *bufferBackNumBits -= needBits; + break; + } + numBits += *bufferBackNumBits; + *bufferBackNumBits = 0; } - - // - // We can have cases where the previous shift of bufferBack is << 64 - - // this is an undefined operation but tends to create just zeroes. - // so if we won't have any bits left, zero out bufferBack instead of computing the shift - // - - if (*bufferBackNumBits <= numBits) { *bufferBack = 0; } - else { *bufferBack = (*bufferBack) << numBits; } - *bufferBackNumBits -= numBits; } static inline uint64_t @@ -1391,7 +1386,7 @@ fasthuf_initialize ( for (int i = 0; i < MAX_CODE_LEN; ++i) fhd->_numSymbols += (uint32_t) codeCount[i]; - if ((size_t) fhd->_numSymbols > sizeof (fhd->_idToSymbol) / sizeof (int)) + if ((size_t) fhd->_numSymbols > sizeof (fhd->_idToSymbol) / sizeof (uint32_t)) { if (pctxt) pctxt->print_error ( @@ -1476,7 +1471,7 @@ fasthuf_initialize ( "Huffman decode error (Invalid symbol in header)"); return EXR_ERR_CORRUPT_CHUNK; } - fhd->_idToSymbol[mapping[codeLen]] = (int) symbol; + fhd->_idToSymbol[mapping[codeLen]] = (uint32_t) symbol; mapping[codeLen]++; } else if (codeLen == (uint64_t) LONG_ZEROCODE_RUN) @@ -1495,7 +1490,12 @@ fasthuf_initialize ( static inline int fasthuf_decode_enabled (void) { -#if defined(__INTEL_COMPILER) || defined(__GNUC__) + // + // Enabled for clang on Apple platforms (tested) + // +#if defined(__APPLE__) && defined(__clang__) + return 1; +#elif defined(__INTEL_COMPILER) || defined(__GNUC__) // // Enabled for ICC, GCC: @@ -1533,22 +1533,23 @@ fasthuf_decode_enabled (void) static exr_result_t fasthuf_decode ( - exr_const_context_t pctxt, - FastHufDecoder* fhd, - const uint8_t* src, - uint64_t numSrcBits, - uint16_t* dst, - uint64_t numDstElems) + exr_const_context_t pctxt, + FastHufDecoder* NO_ALIAS fhd, + const uint8_t* NO_ALIAS src, + uint64_t numSrcBits, + uint16_t* NO_ALIAS dst, + uint64_t numDstElems) { // // Current position (byte/bit) in the src data stream // (after the first buffer fill) // - uint64_t buffer, bufferBack, dstIdx; + uint64_t buffer, bufferBack, dstIdx, tMin; int bufferNumBits, bufferBackNumBits; + uint32_t rleSym; const unsigned char* currByte = src + 2 * sizeof (uint64_t); - numSrcBits -= 8 * 2 * (int) sizeof (uint64_t); + numSrcBits -= 8 * 2 * sizeof (uint64_t); // // 64-bit buffer holding the current bits in the stream @@ -1565,11 +1566,21 @@ fasthuf_decode ( bufferBackNumBits = 64; dstIdx = 0; + tMin = fhd->_tableMin; + rleSym = fhd->_rleSymbol; + while (dstIdx < numDstElems) { - int codeLen; - int symbol; - int rleCount; + uint32_t symbol; + + FastHufDecoder_refill ( + &buffer, + bufferNumBits, + &bufferBack, + &bufferBackNumBits, + &currByte, + &numSrcBits); + bufferNumBits = 64; // // Test if we can be table accelerated. If so, directly @@ -1581,9 +1592,9 @@ fasthuf_decode ( // left. But for a search, we do need a refilled table. // - if (fhd->_tableMin <= buffer) + if (tMin <= buffer) { - int tableIdx = + uint32_t combinedSymbol = fhd->_lookupSymbol[buffer >> (64 - TABLE_LOOKUP_BITS)]; // @@ -1594,11 +1605,22 @@ fasthuf_decode ( // if (codeLen > _maxCodeLength) in this inner. // - codeLen = tableIdx >> 24; - symbol = tableIdx & 0xffffff; + symbol = combinedSymbol & 0xffffff; +// uint32_t codeLen; +// codeLen = combinedSymbol >> 24; +// buffer = buffer << codeLen; +// bufferNumBits -= codeLen; + + // + // Shift over bit stream, and update the bit count in the buffer + // + combinedSymbol >>= 24; + buffer <<= combinedSymbol; + bufferNumBits -= combinedSymbol; } else { + uint32_t codeLen; uint64_t id; // // Brute force search: @@ -1636,14 +1658,14 @@ fasthuf_decode ( "Huffman decode error (Decoded an invalid symbol)"); return EXR_ERR_CORRUPT_CHUNK; } - } - // - // Shift over bit stream, and update the bit count in the buffer - // + // + // Shift over bit stream, and update the bit count in the buffer + // - buffer = buffer << codeLen; - bufferNumBits -= codeLen; + buffer = buffer << codeLen; + bufferNumBits -= codeLen; + } // // If we received a RLE symbol (_rleSymbol), then we need @@ -1652,13 +1674,15 @@ fasthuf_decode ( // 8 bits of data in the buffer // - if (symbol == fhd->_rleSymbol) + if (symbol == rleSym) { + uint32_t rleCount; + if (bufferNumBits < 8) { FastHufDecoder_refill ( &buffer, - 64 - bufferNumBits, + bufferNumBits, &bufferBack, &bufferBackNumBits, &currByte, @@ -1689,7 +1713,7 @@ fasthuf_decode ( return EXR_ERR_CORRUPT_CHUNK; } - if (rleCount <= 0) + if (rleCount == 0 || rleCount >= (uint32_t)INT32_MAX) { if (pctxt) pctxt->print_error ( @@ -1699,10 +1723,10 @@ fasthuf_decode ( return EXR_ERR_CORRUPT_CHUNK; } - for (int i = 0; i < rleCount; ++i) + for (uint32_t i = 0; i < rleCount; ++i) dst[dstIdx + (uint64_t) i] = dst[dstIdx - 1]; - dstIdx += (uint64_t) rleCount; + dstIdx += rleCount; buffer = buffer << 8; bufferNumBits -= 8; @@ -1712,24 +1736,6 @@ fasthuf_decode ( dst[dstIdx] = (uint16_t) symbol; dstIdx++; } - - // - // refill bit stream buffer if we're below the number of - // bits needed for a table lookup - // - - if (bufferNumBits < 64) - { - FastHufDecoder_refill ( - &buffer, - 64 - bufferNumBits, - &bufferBack, - &bufferBackNumBits, - &currByte, - &numSrcBits); - - bufferNumBits = 64; - } } if (numSrcBits != 0)