From 4ea2af9c1d4c4126a8b3166510e6d3eed791615a Mon Sep 17 00:00:00 2001
From: Kimball Thurston <kdt3rd@gmail.com>
Date: Sat, 5 Oct 2024 14:44:26 +1300
Subject: [PATCH] Enable fast huf for os/x, optimize routines

Signed-off-by: Kimball Thurston <kdt3rd@gmail.com>
---
 src/lib/OpenEXRCore/internal_huf.c | 270 +++++++++++++++--------------
 1 file changed, 138 insertions(+), 132 deletions(-)

diff --git a/src/lib/OpenEXRCore/internal_huf.c b/src/lib/OpenEXRCore/internal_huf.c
index 21f554c4d6..9c5f1795b4 100644
--- a/src/lib/OpenEXRCore/internal_huf.c
+++ b/src/lib/OpenEXRCore/internal_huf.c
@@ -8,6 +8,7 @@
 #include "internal_memory.h"
 #include "internal_xdr.h"
 #include "internal_structs.h"
+#include "internal_coding.h"
 
 #include <stddef.h>
 #include <stdint.h>
@@ -1001,37 +1002,21 @@ readUInt (const uint8_t* b)
 
 // Number of bits in our acceleration table. Should match all
 // codes up to TABLE_LOOKUP_BITS in length.
-#define TABLE_LOOKUP_BITS 14
 
-#include <inttypes.h>
+// old c++ code was only 12 bits, not 14 as you might expect from the encode
+// but only having a memory page for the lookup does seem marginally faster
+#define TABLE_LOOKUP_BITS 12
+//#define TABLE_LOOKUP_BITS HUF_DECBITS
 
-#ifdef __APPLE__
-#    include <libkern/OSByteOrder.h>
-#    define SWAP64(c) OSSwapInt64 (c)
-#elif defined(linux)
-#    include <byteswap.h>
-#    define SWAP64(c) bswap_64 (c)
-#elif defined(_MSC_VER)
-#    include <stdlib.h>
-#    define SWAP64(c) _byteswap_uint64 (c)
-#endif
+#include <inttypes.h>
 
-#ifdef SWAP64
-static inline uint64_t
-READ64 (const uint8_t* src)
-{
-    uint64_t v;
-    // unaligned reads are UB
-    memcpy (&v, src, sizeof (uint64_t));
-    return SWAP64 (v);
-}
-#else
+/* unaligned reads are UB, so attempts to work around that result in
+ * slower code than just doing the simple thing */
 #    define READ64(c)                                                          \
         ((uint64_t) (c)[0] << 56) | ((uint64_t) (c)[1] << 48) |                \
             ((uint64_t) (c)[2] << 40) | ((uint64_t) (c)[3] << 32) |            \
             ((uint64_t) (c)[4] << 24) | ((uint64_t) (c)[5] << 16) |            \
             ((uint64_t) (c)[6] << 8) | ((uint64_t) (c)[7])
-#endif
 
 typedef struct FastHufDecoder
 {
@@ -1047,11 +1032,10 @@ typedef struct FastHufDecoder
     uint8_t _maxCodeLength; // Maximum code length, in bits.
     uint8_t _pad[2];
 
-    int _idToSymbol[65536 + 1]; // Maps Ids to symbols. Ids are a symbol
-                                // ordering sorted first in terms of
-                                // code length, and by code within
-                                // the same length. Ids run from 0
-                                // to mNumSymbols-1.
+    // Maps Ids to symbols. Ids are a symbol ordering sorted first in
+    // terms of code length, and by code within the same length. Ids
+    // run from 0 to mNumSymbols-1.
+    uint32_t _idToSymbol[65536 + 1];
 
     uint64_t _ljBase[MAX_CODE_LEN + 1 + 1]; // the 'left justified base' table.
                                             // Takes base[i] (i = code length)
@@ -1073,8 +1057,9 @@ typedef struct FastHufDecoder
     // a symbol to indicate RLE. So with a dense code book, we could
     // have 2^16+1 codes, hence 'symbol' could  be bigger than 16 bits.
     //
-    int _lookupSymbol
-        [1 << TABLE_LOOKUP_BITS]; /* value = (codeLen << 24) | symbol */
+    // value = (codeLen << 24) | symbol
+    // rather than storing two tables
+    uint32_t _lookupSymbol[1 << TABLE_LOOKUP_BITS];
 
     uint64_t _tableMin;
 } FastHufDecoder;
@@ -1179,77 +1164,87 @@ FastHufDecoder_buildTables (
     return EXR_ERR_SUCCESS;
 }
 
+#ifdef __cplusplus
+#    define NO_ALIAS
+#else
+#    define NO_ALIAS restrict
+#endif
+
 static inline void
 FastHufDecoder_refill (
-    uint64_t*       buffer,
-    int             numBits,           // number of bits to refill
-    uint64_t*       bufferBack,        // the next 64-bits, to refill from
-    int*            bufferBackNumBits, // number of bits left in bufferBack
-    const uint8_t** currByte,          // current byte in the bitstream
-    uint64_t*       currBitsLeft)
+    uint64_t* NO_ALIAS                 buffer,
+    int                                numBits,           // number of bits to refill
+    uint64_t* NO_ALIAS                 bufferBack,        // the next 64-bits, to refill from
+    int*                               bufferBackNumBits, // number of bits left in bufferBack
+    const uint8_t* NO_ALIAS * NO_ALIAS currByte,          // current byte in the bitstream
+    uint64_t* NO_ALIAS                 currBitsLeft)
 {
-    //
-    // Refill bits into the bottom of buffer, from the top of bufferBack.
-    // Always top up buffer to be completely full.
-    //
-
-    *buffer |= (*bufferBack) >> (64 - numBits);
+    int needBits;
 
-    if (*bufferBackNumBits < numBits)
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+    if (__builtin_expect ((numBits == 0), 0))
+#else
+    if (numBits == 0)
+#endif
     {
-        numBits -= *bufferBackNumBits;
-
-        //
-        // Refill all of bufferBack from the bitstream. Either grab
-        // a full 64-bit chunk, or whatever bytes are left. If we
-        // don't have 64-bits left, pad with 0's.
-        //
+        *buffer = *bufferBack;
+        numBits = *bufferBackNumBits;
+        *bufferBackNumBits = 0;
+    }
 
-        if (*currBitsLeft >= 64)
-        {
-            *bufferBack        = READ64 (*currByte);
-            *bufferBackNumBits = 64;
-            *currByte += sizeof (uint64_t);
-            *currBitsLeft -= 8 * sizeof (uint64_t);
-        }
-        else
+    while (numBits < 64)
+    {
+        if (*bufferBackNumBits <= 0)
         {
-            uint64_t shift = 56;
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+            if (__builtin_expect ((*currBitsLeft >= 64), 1))
+#else
+            if (*currBitsLeft >= 64)
+#endif
+            {
+                *bufferBack        = READ64 (*currByte);
+                *bufferBackNumBits = 64;
+                *currByte += sizeof (uint64_t);
+                *currBitsLeft -= 8 * sizeof (uint64_t);
+            }
+            else
+            {
+                uint64_t shift = 56;
 
-            *bufferBack        = 0;
-            *bufferBackNumBits = 64;
+                *bufferBack        = 0;
+                *bufferBackNumBits = 64;
 
-            while (*currBitsLeft >= 8)
-            {
-                *bufferBack |= ((uint64_t) (**currByte)) << shift;
+                while (*currBitsLeft >= 8)
+                {
+                    *bufferBack |= ((uint64_t) (**currByte)) << shift;
 
-                (*currByte)++;
-                shift -= 8;
-                *currBitsLeft -= 8;
-            }
+                    (*currByte)++;
+                    shift -= 8;
+                    *currBitsLeft -= 8;
+                }
 
-            if (*currBitsLeft > 0)
-            {
-                *bufferBack |= ((uint64_t) (**currByte)) << shift;
+                if (*currBitsLeft > 0)
+                {
+                    *bufferBack |= ((uint64_t) (**currByte)) << shift;
 
-                (*currByte)++;
-                shift -= 8;
-                *currBitsLeft = 0;
+                    (*currByte)++;
+                    shift -= 8;
+                    *currBitsLeft = 0;
+                }
             }
         }
 
-        *buffer |= (*bufferBack) >> (64 - numBits);
+        needBits = 64 - numBits;
+        *buffer |= (*bufferBack) >> numBits;
+        if (*bufferBackNumBits >= needBits)
+        {
+            *bufferBack <<= needBits;
+            *bufferBackNumBits -= needBits;
+            break;
+        }
+        numBits += *bufferBackNumBits;
+        *bufferBackNumBits = 0;
     }
-
-    //
-    // We can have cases where the previous shift of bufferBack is << 64 -
-    // this is an undefined operation but tends to create just zeroes.
-    // so if we won't have any bits left, zero out bufferBack instead of computing the shift
-    //
-
-    if (*bufferBackNumBits <= numBits) { *bufferBack = 0; }
-    else { *bufferBack = (*bufferBack) << numBits; }
-    *bufferBackNumBits -= numBits;
 }
 
 static inline uint64_t
@@ -1391,7 +1386,7 @@ fasthuf_initialize (
     for (int i = 0; i < MAX_CODE_LEN; ++i)
         fhd->_numSymbols += (uint32_t) codeCount[i];
 
-    if ((size_t) fhd->_numSymbols > sizeof (fhd->_idToSymbol) / sizeof (int))
+    if ((size_t) fhd->_numSymbols > sizeof (fhd->_idToSymbol) / sizeof (uint32_t))
     {
         if (pctxt)
             pctxt->print_error (
@@ -1476,7 +1471,7 @@ fasthuf_initialize (
                         "Huffman decode error (Invalid symbol in header)");
                 return EXR_ERR_CORRUPT_CHUNK;
             }
-            fhd->_idToSymbol[mapping[codeLen]] = (int) symbol;
+            fhd->_idToSymbol[mapping[codeLen]] = (uint32_t) symbol;
             mapping[codeLen]++;
         }
         else if (codeLen == (uint64_t) LONG_ZEROCODE_RUN)
@@ -1495,7 +1490,12 @@ fasthuf_initialize (
 static inline int
 fasthuf_decode_enabled (void)
 {
-#if defined(__INTEL_COMPILER) || defined(__GNUC__)
+    //
+    // Enabled for clang on Apple platforms (tested)
+    //
+#if defined(__APPLE__) && defined(__clang__)
+    return 1;
+#elif defined(__INTEL_COMPILER) || defined(__GNUC__)
 
     //
     // Enabled for ICC, GCC:
@@ -1533,22 +1533,23 @@ fasthuf_decode_enabled (void)
 
 static exr_result_t
 fasthuf_decode (
-    exr_const_context_t pctxt,
-    FastHufDecoder*     fhd,
-    const uint8_t*      src,
-    uint64_t            numSrcBits,
-    uint16_t*           dst,
-    uint64_t            numDstElems)
+    exr_const_context_t         pctxt,
+    FastHufDecoder* NO_ALIAS    fhd,
+    const uint8_t* NO_ALIAS     src,
+    uint64_t                    numSrcBits,
+    uint16_t* NO_ALIAS          dst,
+    uint64_t                    numDstElems)
 {
     //
     // Current position (byte/bit) in the src data stream
     // (after the first buffer fill)
     //
-    uint64_t             buffer, bufferBack, dstIdx;
+    uint64_t             buffer, bufferBack, dstIdx, tMin;
     int                  bufferNumBits, bufferBackNumBits;
+    uint32_t             rleSym;
     const unsigned char* currByte = src + 2 * sizeof (uint64_t);
 
-    numSrcBits -= 8 * 2 * (int) sizeof (uint64_t);
+    numSrcBits -= 8 * 2 * sizeof (uint64_t);
 
     //
     // 64-bit buffer holding the current bits in the stream
@@ -1565,11 +1566,21 @@ fasthuf_decode (
     bufferBackNumBits = 64;
     dstIdx            = 0;
 
+    tMin = fhd->_tableMin;
+    rleSym = fhd->_rleSymbol;
+
     while (dstIdx < numDstElems)
     {
-        int codeLen;
-        int symbol;
-        int rleCount;
+        uint32_t symbol;
+
+        FastHufDecoder_refill (
+            &buffer,
+            bufferNumBits,
+            &bufferBack,
+            &bufferBackNumBits,
+            &currByte,
+            &numSrcBits);
+        bufferNumBits = 64;
 
         //
         // Test if we can be table accelerated. If so, directly
@@ -1581,9 +1592,9 @@ fasthuf_decode (
         // left. But for a search, we do need a refilled table.
         //
 
-        if (fhd->_tableMin <= buffer)
+        if (tMin <= buffer)
         {
-            int tableIdx =
+            uint32_t combinedSymbol =
                 fhd->_lookupSymbol[buffer >> (64 - TABLE_LOOKUP_BITS)];
 
             //
@@ -1594,11 +1605,22 @@ fasthuf_decode (
             //     if (codeLen > _maxCodeLength) in this inner.
             //
 
-            codeLen = tableIdx >> 24;
-            symbol  = tableIdx & 0xffffff;
+            symbol  = combinedSymbol & 0xffffff;
+//            uint32_t codeLen;
+//            codeLen = combinedSymbol >> 24;
+//            buffer = buffer << codeLen;
+//            bufferNumBits -= codeLen;
+
+            //
+            // Shift over bit stream, and update the bit count in the buffer
+            //
+            combinedSymbol >>= 24;
+            buffer <<= combinedSymbol;
+            bufferNumBits -= combinedSymbol;
         }
         else
         {
+            uint32_t codeLen;
             uint64_t id;
             //
             // Brute force search:
@@ -1636,14 +1658,14 @@ fasthuf_decode (
                         "Huffman decode error (Decoded an invalid symbol)");
                 return EXR_ERR_CORRUPT_CHUNK;
             }
-        }
 
-        //
-        // Shift over bit stream, and update the bit count in the buffer
-        //
+            //
+            // Shift over bit stream, and update the bit count in the buffer
+            //
 
-        buffer = buffer << codeLen;
-        bufferNumBits -= codeLen;
+            buffer = buffer << codeLen;
+            bufferNumBits -= codeLen;
+        }
 
         //
         // If we received a RLE symbol (_rleSymbol), then we need
@@ -1652,13 +1674,15 @@ fasthuf_decode (
         // 8 bits of data in the buffer
         //
 
-        if (symbol == fhd->_rleSymbol)
+        if (symbol == rleSym)
         {
+            uint32_t rleCount;
+
             if (bufferNumBits < 8)
             {
                 FastHufDecoder_refill (
                     &buffer,
-                    64 - bufferNumBits,
+                    bufferNumBits,
                     &bufferBack,
                     &bufferBackNumBits,
                     &currByte,
@@ -1689,7 +1713,7 @@ fasthuf_decode (
                 return EXR_ERR_CORRUPT_CHUNK;
             }
 
-            if (rleCount <= 0)
+            if (rleCount == 0 || rleCount >= (uint32_t)INT32_MAX)
             {
                 if (pctxt)
                     pctxt->print_error (
@@ -1699,10 +1723,10 @@ fasthuf_decode (
                 return EXR_ERR_CORRUPT_CHUNK;
             }
 
-            for (int i = 0; i < rleCount; ++i)
+            for (uint32_t i = 0; i < rleCount; ++i)
                 dst[dstIdx + (uint64_t) i] = dst[dstIdx - 1];
 
-            dstIdx += (uint64_t) rleCount;
+            dstIdx += rleCount;
 
             buffer = buffer << 8;
             bufferNumBits -= 8;
@@ -1712,24 +1736,6 @@ fasthuf_decode (
             dst[dstIdx] = (uint16_t) symbol;
             dstIdx++;
         }
-
-        //
-        // refill bit stream buffer if we're below the number of
-        // bits needed for a table lookup
-        //
-
-        if (bufferNumBits < 64)
-        {
-            FastHufDecoder_refill (
-                &buffer,
-                64 - bufferNumBits,
-                &bufferBack,
-                &bufferBackNumBits,
-                &currByte,
-                &numSrcBits);
-
-            bufferNumBits = 64;
-        }
     }
 
     if (numSrcBits != 0)