A few optimizations for the gcinfodecoder construction #96150

Merged · 13 commits · Dec 28, 2023
73 changes: 38 additions & 35 deletions src/coreclr/inc/gcinfodecoder.h
@@ -265,6 +265,7 @@ class BitStreamReader

m_pCurrent = m_pBuffer = dac_cast<PTR_size_t>((size_t)dac_cast<TADDR>(pBuffer) & ~((size_t)sizeof(size_t)-1));
m_RelPos = m_InitialRelPos = (int)((size_t)dac_cast<TADDR>(pBuffer) % sizeof(size_t)) * 8/*BITS_PER_BYTE*/;
+m_current = *m_pCurrent >> m_RelPos;
}

BitStreamReader(const BitStreamReader& other)
@@ -275,6 +276,7 @@ class BitStreamReader
m_InitialRelPos = other.m_InitialRelPos;
m_pCurrent = other.m_pCurrent;
m_RelPos = other.m_RelPos;
+m_current = other.m_current;
@VSadov (Member, Author) commented on Dec 19, 2023:

On 64-bit, one native word can act as a "buffer" for quite a few reads when each read takes only a few bits. This change reduces the need for indirect reads from the bit stream and may allow the compiler to enregister the "buffer".

}
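The word-caching scheme the comment describes can be sketched in isolation. This is a hypothetical, simplified model (a made-up `TinyBitReader` over fixed 64-bit words, LSB-first), not the runtime's actual class:

```cpp
#include <cassert>
#include <cstdint>

// Sketch of the word-caching idea: keep the current native word in a
// member so consecutive small reads shift a (likely enregistered)
// value instead of re-reading *m_pCurrent each time.
struct TinyBitReader
{
    const uint64_t* m_pCurrent;
    uint64_t m_current;   // cached copy of *m_pCurrent, pre-shifted
    int m_RelPos;         // bit position within the current word

    explicit TinyBitReader(const uint64_t* buffer)
        : m_pCurrent(buffer), m_current(*buffer), m_RelPos(0) {}

    uint64_t ReadOneFast()
    {
        uint64_t result = m_current & 1;  // fixed-size shift and mask only
        m_current >>= 1;
        if (++m_RelPos == 64)
        {
            m_pCurrent++;            // word boundary: rare (1 in 64 reads)
            m_current = *m_pCurrent;
            m_RelPos = 0;
        }
        return result;
    }
};
```

Reading the word `0b1101` LSB-first yields the bits 1, 0, 1, 1, touching memory only when a word boundary is crossed.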

const BitStreamReader& operator=(const BitStreamReader& other)
@@ -285,6 +287,7 @@ class BitStreamReader
m_InitialRelPos = other.m_InitialRelPos;
m_pCurrent = other.m_pCurrent;
m_RelPos = other.m_RelPos;
+m_current = other.m_current;
return *this;
}

@@ -295,33 +298,35 @@ class BitStreamReader

_ASSERTE(numBits > 0 && numBits <= BITS_PER_SIZE_T);

-size_t result = (*m_pCurrent) >> m_RelPos;
+size_t result = m_current;
+m_current >>= numBits;
int newRelPos = m_RelPos + numBits;
if(newRelPos >= BITS_PER_SIZE_T)
{
m_pCurrent++;
+m_current = *m_pCurrent;
newRelPos -= BITS_PER_SIZE_T;
-if(newRelPos > 0)
-{
-size_t extraBits = (*m_pCurrent) << (numBits - newRelPos);
-result ^= extraBits;
-}
+size_t extraBits = m_current << (numBits - newRelPos);
+result |= extraBits;
+m_current >>= newRelPos;
}
m_RelPos = newRelPos;
-result &= SAFE_SHIFT_LEFT(1, numBits) - 1;
+result &= ((size_t)-1 >> (BITS_PER_SIZE_T - numBits));
return result;
}
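A side note on the mask change in Read: both `SAFE_SHIFT_LEFT(1, numBits) - 1` and `(size_t)-1 >> (BITS_PER_SIZE_T - numBits)` produce a mask of the low `numBits` bits, but the shifted-all-ones form stays well defined at `numBits == BITS_PER_SIZE_T` without needing a guard, since the shift count is then 0. A quick stand-alone check, with hypothetical helper names and a 64-bit word assumed:

```cpp
#include <cassert>
#include <cstdint>

// New form: defined for the whole range 1..64, because the shift
// count (64 - numBits) stays within 0..63.
inline uint64_t MaskNew(int numBits)
{
    return ~uint64_t{0} >> (64 - numBits);
}

// Old form, written with the guard SAFE_SHIFT_LEFT provides:
// a plain (1 << 64) would be undefined behavior.
inline uint64_t MaskOld(int numBits)
{
    return (numBits == 64) ? ~uint64_t{0} : (uint64_t{1} << numBits) - 1;
}

// Verify the two expressions agree over the full legal range.
inline bool MasksAgree()
{
    for (int numBits = 1; numBits <= 64; numBits++)
        if (MaskNew(numBits) != MaskOld(numBits))
            return false;
    return true;
}
```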

-// This version reads one bit, returning zero/non-zero (not 0/1)
+// This version reads one bit
// NOTE: This routine is perf-critical
__forceinline size_t ReadOneFast()
{
SUPPORTS_DAC;

-size_t result = (*m_pCurrent) & (((size_t)1) << m_RelPos);
+size_t result = m_current & 1;
+m_current >>= 1;
@VSadov (Member, Author) commented:

The point of this change is to use a fixed-size shift, which is typically faster than a variable-size shift.
The same applies to Read(int numBits) when we read a fixed-size nibble.

A reviewer (Member) replied:

Yes, but you pay for it with extra memory writes. Is it really a win in the end?

For example, here are timings for Intel 11th gen from
https://www.agner.org/optimize/instruction_tables.pdf (page 350):

| SHR SHL SAR r,i | 1 | 1 |
| SHR SHL SAR m,i | 4 | 4 |
| SHR SHL SAR r,cl | 2 | 2 |
| SHR SHL SAR m,cl | 5 | 6 |

Constant shift of memory is twice the cost of variable shift of register.

@VSadov (Member, Author) commented on Dec 26, 2023:

I will have to recheck the original codegen, but I think what was happening is that we would do an indirect read and then apply a mask that was constructed via a variable shift of 1.
I guess that was because we need the result in a register and do not want to change the bit stream, and the way m_pCurrent and m_RelPos were changing did not allow the compiler to hoist/CSE/enregister either the result of the indirect read or the computed mask.

@VSadov (Member, Author) commented on Dec 26, 2023:

Interestingly for Zen3 the table gives no difference whatsoever between immediate and CL shift versions.

SHL, SHR, SAR r,i/CL | 1 | 1 | 0.5

yet the profiler finds CL shift operations relatively expensive.
I often see that in other code, so I always assumed that variable shifts are somewhat costly.

Maybe it is not the instruction itself but the whole sequence of dependent instructions - load 1, load CL, make a mask, do a read, apply the mask - and the sampling profiler just attributes most of that to the shift.

The code does get measurably faster after the change. I had a few other changes that did not improve anything, so I did not include them, but this one definitely helped.

C++ compilers inline and interleave statement parts quite aggressively, so it is a bit hard to see what belongs where, but I can see that a few "expensive" CL shifts are gone.
Here is an example of what happened to a loop like:

                for(UINT32 i = 0; i < numSlots; i++)
                {
                    if(m_Reader.ReadOneFast())
                        numCouldBeLiveSlots++;
                }

It is not an example of expensive/hot code, just something that is easier to read and see how codegen changed.

==== original code

00007FF700CE9C4E  test        r13d,r13d  
00007FF700CE9C51  je          GcInfoDecoder::EnumerateLiveSlots+898h (07FF700CE9C98h)  
00007FF700CE9C53  mov         r8d,r13d  
00007FF700CE9C56  nop         word ptr [rax+rax]  

00007FF700CE9C60  mov         ecx,r15d  
00007FF700CE9C63  mov         rax,r11        ;  r11 has `1` in it, assigned outside of the loop
00007FF700CE9C66  shl         rax,cl                   ;  depends on 2 instructions above
00007FF700CE9C69  and         rax,qword ptr [rdx]       ; depends on instruction above + read (L1 is 3-5 cycles)
00007FF700CE9C6C  inc         r15d                           
00007FF700CE9C6F  mov         dword ptr [rdi+18h],r15d      
00007FF700CE9C73  cmp         r15d,40h  
00007FF700CE9C77  jne         GcInfoDecoder::EnumerateLiveSlots+88Bh (07FF700CE9C8Bh)  
00007FF700CE9C79  add         rdx,8                  ; moving to the next word, relatively rare (1 in 64 reads)
00007FF700CE9C7D  mov         dword ptr [rdi+18h],0  
00007FF700CE9C84  mov         qword ptr [rdi+10h],rdx  
00007FF700CE9C88  xor         r15d,r15d  
00007FF700CE9C8B  test        rax,rax  
00007FF700CE9C8E  je          GcInfoDecoder::EnumerateLiveSlots+893h (07FF700CE9C93h)  
00007FF700CE9C90  inc         r12d  
00007FF700CE9C93  sub         r8,r11  

vs.

==== new

00007FF688D6A454  test        r12d,r12d  
00007FF688D6A457  je          GcInfoDecoder::EnumerateLiveSlots+0C04h (07FF688D6A4B4h)  
00007FF688D6A459  mov         rax,r11  
00007FF688D6A45C  mov         edx,r12d  
00007FF688D6A45F  nop  
00007FF688D6A460  mov         rcx,rax  
00007FF688D6A463  inc         r8d               ;  can run together or even before the previous instruction
00007FF688D6A466  shr         rax,1             ;  this and the next can run at the same time
00007FF688D6A469  and         ecx,1             ;  both only depend on "mov         rcx,rax"
00007FF688D6A46C  mov         qword ptr [rbx+20h],rax ; we do writes, but we do not need to wait for them
00007FF688D6A470  mov         dword ptr [rbx+18h],r8d
00007FF688D6A474  mov         qword ptr [rsp+48h],rax ; not sure what is stored here, in other cases we have just 2 writes
00007FF688D6A479  cmp         r8d,40h  
00007FF688D6A47D  jne         GcInfoDecoder::EnumerateLiveSlots+0BEBh (07FF688D6A49Bh)  
00007FF688D6A47F  add         qword ptr [rbx+10h],8     ; moving to the next word, relatively rare (1 in 64 reads)
00007FF688D6A484  mov         rax,qword ptr [rbx+10h]  
00007FF688D6A488  xor         r8d,r8d  
00007FF688D6A48B  mov         rax,qword ptr [rax]  
00007FF688D6A48E  mov         qword ptr [rbx+20h],rax  
00007FF688D6A492  mov         qword ptr [rsp+48h],rax  
00007FF688D6A497  mov         dword ptr [rbx+18h],r8d  
00007FF688D6A49B  test        rcx,rcx  
00007FF688D6A49E  je          GcInfoDecoder::EnumerateLiveSlots+0BF3h (07FF688D6A4A3h)  
00007FF688D6A4A0  inc         r13d  
00007FF688D6A4A3  sub         rdx,1  

if(++m_RelPos == BITS_PER_SIZE_T)
{
m_pCurrent++;
+m_current = *m_pCurrent;
m_RelPos = 0;
}
return result;
@@ -339,6 +344,7 @@ class BitStreamReader
size_t adjPos = pos + m_InitialRelPos;
m_pCurrent = m_pBuffer + adjPos / BITS_PER_SIZE_T;
m_RelPos = (int)(adjPos % BITS_PER_SIZE_T);
+m_current = *m_pCurrent >> m_RelPos;
_ASSERTE(GetCurrentPos() == pos);
}

@@ -349,19 +355,6 @@ class BitStreamReader
SetCurrentPos(GetCurrentPos() + numBitsToSkip);
}

-__forceinline void AlignUpToByte()
-{
-if(m_RelPos <= BITS_PER_SIZE_T - 8)
-{
-m_RelPos = (m_RelPos + 7) & ~7;
-}
-else
-{
-m_RelPos = 0;
-m_pCurrent++;
-}
-}

__forceinline size_t ReadBitAtPos( size_t pos )
{
size_t adjPos = pos + m_InitialRelPos;
@@ -376,17 +369,17 @@ class BitStreamReader
// See the corresponding methods on BitStreamWriter for more information on the format
//--------------------------------------------------------------------------

-inline size_t DecodeVarLengthUnsigned( int base )
+size_t DecodeVarLengthUnsignedMore( int base )
{
_ASSERTE((base > 0) && (base < (int)BITS_PER_SIZE_T));
size_t numEncodings = size_t{ 1 } << base;
-size_t result = 0;
-for(int shift=0; ; shift+=base)
+size_t result = numEncodings;
+for(int shift=base; ; shift+=base)
{
_ASSERTE(shift+base <= (int)BITS_PER_SIZE_T);

size_t currentChunk = Read(base+1);
-result |= (currentChunk & (numEncodings-1)) << shift;
+result ^= (currentChunk & (numEncodings-1)) << shift;
if(!(currentChunk & numEncodings))
{
// Extension bit is not set, we're done.
@@ -395,6 +388,19 @@ class BitStreamReader
}
}

+__forceinline size_t DecodeVarLengthUnsigned(int base)
+{
+_ASSERTE((base > 0) && (base < (int)BITS_PER_SIZE_T));
+
+size_t result = Read(base + 1);
+if (result & ((size_t)1 << base))
+{
+result ^= DecodeVarLengthUnsignedMore(base);
+}
+
+return result;
+}

inline SSIZE_T DecodeVarLengthSigned( int base )
{
_ASSERTE((base > 0) && (base < (int)BITS_PER_SIZE_T));
@@ -422,6 +428,7 @@ class BitStreamReader
int m_InitialRelPos;
PTR_size_t m_pCurrent;
int m_RelPos;
+size_t m_current;
};

struct GcSlotDesc
@@ -565,15 +572,8 @@ class GcInfoDecoder
UINT32 m_InstructionOffset;

// Pre-decoded information
+GcInfoHeaderFlags m_headerFlags;
-bool m_IsInterruptible;
-bool m_IsVarArg;
-bool m_GenericSecretParamIsMD;
-bool m_GenericSecretParamIsMT;
@VSadov (Member, Author) commented on Dec 19, 2023:

These are derived from masking the header flags. Masking is cheap, and we may not even be asked for these, so we can do the masking in the accessors.

-#ifdef TARGET_AMD64
-bool m_WantsReportOnlyLeaf;
-#elif defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
-bool m_HasTailCalls;
-#endif // TARGET_AMD64
INT32 m_GSCookieStackSlot;
INT32 m_ReversePInvokeFrameStackSlot;
UINT32 m_ValidRangeStart;
@@ -590,7 +590,8 @@ class GcInfoDecoder
#ifdef PARTIALLY_INTERRUPTIBLE_GC_SUPPORTED
UINT32 m_NumSafePoints;
UINT32 m_SafePointIndex;
-UINT32 FindSafePoint(UINT32 codeOffset);
+UINT32 NarrowSafePointSearch(size_t savedPos, UINT32 breakOffset, UINT32* searchEnd);
+UINT32 FindSafePoint(UINT32 codeOffset);
#endif
UINT32 m_NumInterruptibleRanges;

@@ -604,6 +605,8 @@ class GcInfoDecoder
#endif
UINT32 m_Version;

+bool PredecodeFatHeader(int remainingFlags);

static bool SetIsInterruptibleCB (UINT32 startOffset, UINT32 stopOffset, void * hCallback);

OBJECTREF* GetRegisterSlot(
35 changes: 28 additions & 7 deletions src/coreclr/inc/gcinfotypes.h
@@ -9,6 +9,10 @@
#include "gcinfo.h"
#endif

+#ifdef _MSC_VER
+#include <intrin.h>
+#endif // _MSC_VER

// *****************************************************************************
// WARNING!!!: These values and code are also used by SOS in the diagnostics
// repo. Should be updated in a backwards- and forwards-compatible way.
@@ -43,14 +47,31 @@ __forceinline size_t SAFE_SHIFT_RIGHT(size_t x, size_t count)

inline UINT32 CeilOfLog2(size_t x)
{
+// it is ok to use bsr or clz unconditionally
_ASSERTE(x > 0);
-UINT32 result = (x & (x - 1)) ? 1 : 0;
-while (x != 1)
-{
-result++;
-x >>= 1;
-}
-return result;

+x = (x << 1) - 1;

+#ifdef TARGET_64BIT
+#ifdef _MSC_VER
+DWORD result;
+_BitScanReverse64(&result, (unsigned __int64)x);
+return (UINT32)result;
+#else // _MSC_VER
+// LZCNT returns the index starting from the MSB, whereas BSR gives the index from the LSB.
+// 63 ^ LZCNT here is equivalent to 63 - LZCNT since the LZCNT result is always between 0 and 63.
+// This saves an instruction, as subtraction from a constant requires either MOV/SUB or NEG/ADD.
+return (UINT32)63 ^ (UINT32)__builtin_clzl((unsigned long)x);
+#endif // _MSC_VER
+#else // TARGET_64BIT
+#ifdef _MSC_VER
+DWORD result;
+_BitScanReverse(&result, (unsigned int)x);
+return (UINT32)result;
+#else // _MSC_VER
+return (UINT32)31 ^ (UINT32)__builtin_clz((unsigned int)x);
+#endif // _MSC_VER
+#endif // TARGET_64BIT
}
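The rewritten CeilOfLog2 replaces the shift loop with a bit scan: mapping x to (x << 1) - 1 turns "ceiling of log2" into "floor of log2", which BSR/LZCNT compute directly. A sketch comparing both forms, using the GCC/Clang builtin spelling (an assumption about the toolchain; the MSVC path uses _BitScanReverse64 instead):

```cpp
#include <cassert>
#include <cstdint>

// Reference implementation mirroring the loop the PR replaces.
// Assumes x > 0, matching the _ASSERTE in the original.
inline uint32_t CeilOfLog2Loop(uint64_t x)
{
    uint32_t result = (x & (x - 1)) ? 1 : 0;  // 1 if x is not a power of two
    while (x != 1)
    {
        result++;
        x >>= 1;
    }
    return result;
}

// Intrinsic form from the PR: (x << 1) - 1 fills in the low bits so
// that the index of the highest set bit becomes ceil(log2(x)).
inline uint32_t CeilOfLog2Fast(uint64_t x)
{
    x = (x << 1) - 1;
    return 63u ^ (uint32_t)__builtin_clzll(x);  // 63 - clz, as in the PR comment
}
```

For example, x = 8 maps to 15 (highest set bit 3, and ceil(log2 8) = 3), while x = 5 maps to 9 (highest set bit 3, and ceil(log2 5) = 3).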

enum GcSlotFlags