From 1a49f612517d24e67e5c18201768d811af88b591 Mon Sep 17 00:00:00 2001
From: cuteant <cuteant@outlook.com>
Date: Wed, 23 Jun 2021 02:00:35 +0800
Subject: [PATCH 1/5] Use C# compiler provided nint/nuint

---
 .../Internal/ASCIIUtility.Helpers.cs          |    6 +-
 .../{ASCIIUtility.x64.cs => ASCIIUtility.cs}  |   54 +-
 .../Internal/ASCIIUtility.x32.cs              | 1729 -----------------
 .../Internal/PlatformDependent.cs             |    2 +-
 .../Internal/SpanHelpers.Byte.cs              |   27 +-
 .../Internal/SpanHelpers.Char.cs              |  466 +----
 .../Internal/TextEncodings.Utf16.NetCore3.cs  |    9 +-
 .../Internal/TextEncodings.Utf8.NetCore3.cs   |   17 +-
 ...lidation.cs => Utf16Utility.Validation.cs} |   43 +-
 src/DotNetty.Common/Internal/Utf16Utility.cs  |    2 +-
 .../Internal/Utf16Utility64.Validation.cs     |  433 -----
 ...nscoding.cs => Utf8Utility.Transcoding.cs} |   18 +-
 ...alidation.cs => Utf8Utility.Validation.cs} |   14 +-
 src/DotNetty.Common/Internal/Utf8Utility.cs   |    4 +-
 .../Internal/Utf8Utility32.Transcoding.cs     | 1477 --------------
 .../Internal/Utf8Utility32.Validation.cs      |  736 -------
 .../Utilities/AsciiString.NetCore3.cs         |    4 +-
 17 files changed, 153 insertions(+), 4888 deletions(-)
 rename src/DotNetty.Common/Internal/{ASCIIUtility.x64.cs => ASCIIUtility.cs} (97%)
 delete mode 100644 src/DotNetty.Common/Internal/ASCIIUtility.x32.cs
 rename src/DotNetty.Common/Internal/{Utf16Utility32.Validation.cs => Utf16Utility.Validation.cs} (94%)
 delete mode 100644 src/DotNetty.Common/Internal/Utf16Utility64.Validation.cs
 rename src/DotNetty.Common/Internal/{Utf8Utility64.Transcoding.cs => Utf8Utility.Transcoding.cs} (98%)
 rename src/DotNetty.Common/Internal/{Utf8Utility64.Validation.cs => Utf8Utility.Validation.cs} (98%)
 delete mode 100644 src/DotNetty.Common/Internal/Utf8Utility32.Transcoding.cs
 delete mode 100644 src/DotNetty.Common/Internal/Utf8Utility32.Validation.cs
diff --git a/src/DotNetty.Common/Internal/ASCIIUtility.Helpers.cs b/src/DotNetty.Common/Internal/ASCIIUtility.Helpers.cs
index ec23348ae..189d4a4d8 100644
--- a/src/DotNetty.Common/Internal/ASCIIUtility.Helpers.cs
+++ b/src/DotNetty.Common/Internal/ASCIIUtility.Helpers.cs
@@ -13,17 +13,17 @@
 
 namespace DotNetty.Common.Internal
 {
-    internal static class ASCIIUtility
+    partial class ASCIIUtility
     {
         /// <summary>
         /// A mask which selects only the high bit of each byte of the given <see cref="uint"/>.
         /// </summary>
-        internal const uint UInt32HighBitsOnlyMask = 0x80808080u;
+        private const uint UInt32HighBitsOnlyMask = 0x80808080u;
 
         /// <summary>
         /// A mask which selects only the high bit of each byte of the given <see cref="ulong"/>.
         /// </summary>
-        internal const ulong UInt64HighBitsOnlyMask = 0x80808080_80808080ul;
+        private const ulong UInt64HighBitsOnlyMask = 0x80808080_80808080ul;
 
         /// <summary>
         /// Returns <see langword="true"/> iff all bytes in <paramref name="value"/> are ASCII.
diff --git a/src/DotNetty.Common/Internal/ASCIIUtility.x64.cs b/src/DotNetty.Common/Internal/ASCIIUtility.cs
similarity index 97%
rename from src/DotNetty.Common/Internal/ASCIIUtility.x64.cs
rename to src/DotNetty.Common/Internal/ASCIIUtility.cs
index e518ddec8..cb419bb84 100644
--- a/src/DotNetty.Common/Internal/ASCIIUtility.x64.cs
+++ b/src/DotNetty.Common/Internal/ASCIIUtility.cs
@@ -1,4 +1,4 @@
-﻿// borrowed from https://github.com/dotnet/corefx/blob/release/3.1/src/Common/src/CoreLib/System/Text/ASCIIUtility.cs
+﻿// borrowed from https://github.com/dotnet/corefx/blob/release/3.1/src/Common/src/CoreLib/System/Text/ASCIIUtility.cs
 
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
@@ -11,27 +11,17 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
-using nint = System.Int64;
-using nuint = System.UInt64;
 
 namespace DotNetty.Common.Internal
 {
-    internal static partial class ASCIIUtility64
+    internal static partial class ASCIIUtility
     {
-#if DEBUG
-        static ASCIIUtility64()
-        {
-            Debug.Assert(sizeof(nint) == IntPtr.Size && nint.MinValue < 0, "nint is defined incorrectly.");
-            Debug.Assert(sizeof(nuint) == IntPtr.Size && nuint.MinValue == 0, "nuint is defined incorrectly.");
-        }
-#endif // DEBUG
-
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static bool AllBytesInUInt64AreAscii(ulong value)
         {
             // If the high bit of any byte is set, that byte is non-ASCII.
 
-            return (0ul >= (value & ASCIIUtility.UInt64HighBitsOnlyMask));
+            return (0ul >= (value & UInt64HighBitsOnlyMask));
         }
 
         /// <summary>
@@ -150,12 +140,12 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, n
                 currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
                 uint nextUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer + 4);
 
-                if (!ASCIIUtility.AllBytesInUInt32AreAscii(currentUInt32 | nextUInt32))
+                if (!AllBytesInUInt32AreAscii(currentUInt32 | nextUInt32))
                 {
                     // One of these two values contains non-ASCII bytes.
                     // Figure out which one it is, then put it in 'current' so that we can drain the ASCII bytes.
 
-                    if (ASCIIUtility.AllBytesInUInt32AreAscii(currentUInt32))
+                    if (AllBytesInUInt32AreAscii(currentUInt32))
                     {
                         currentUInt32 = nextUInt32;
                         pBuffer += 4;
@@ -173,7 +163,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, n
             if ((bufferLength & 4) != 0)
             {
                 currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
-                if (!ASCIIUtility.AllBytesInUInt32AreAscii(currentUInt32))
+                if (!AllBytesInUInt32AreAscii(currentUInt32))
                 {
                     goto FoundNonAsciiData;
                 }
@@ -186,7 +176,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, n
             if ((bufferLength & 2) != 0)
             {
                 currentUInt32 = Unsafe.ReadUnaligned<ushort>(pBuffer);
-                if (!ASCIIUtility.AllBytesInUInt32AreAscii(currentUInt32))
+                if (!AllBytesInUInt32AreAscii(currentUInt32))
                 {
                     goto FoundNonAsciiData;
                 }
@@ -214,14 +204,14 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, n
 
         FoundNonAsciiData:
 
-            Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input.");
+            Debug.Assert(!AllBytesInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input.");
 
             // The method being called doesn't bother looking at whether the high byte is ASCII. There are only
             // two scenarios: (a) either one of the earlier bytes is not ASCII and the search terminates before
             // we get to the high byte; or (b) all of the earlier bytes are ASCII, so the high byte must be
             // non-ASCII. In both cases we only care about the low 24 bits.
 
-            pBuffer += ASCIIUtility.CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentUInt32);
+            pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentUInt32);
             goto Finish;
         }
 
@@ -381,8 +371,8 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin
         FoundNonAsciiDataInCurrentDWord:
 
             uint currentDWord;
-            Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");
-            pBuffer += ASCIIUtility.CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentDWord);
+            Debug.Assert(!AllBytesInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");
+            pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentDWord);
 
             goto Finish;
 
@@ -408,7 +398,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin
                         // Clear everything but the high bit of each byte, then tzcnt.
                         // Remember the / 8 at the end to convert bit count to byte count.
 
-                        candidateUInt64 &= ASCIIUtility.UInt64HighBitsOnlyMask;
+                        candidateUInt64 &= UInt64HighBitsOnlyMask;
                         pBuffer += (nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8);
                         goto Finish;
                     }
@@ -420,12 +410,12 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin
                     currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
                     uint nextDWord = Unsafe.ReadUnaligned<uint>(pBuffer + 4);
 
-                    if (!ASCIIUtility.AllBytesInUInt32AreAscii(currentDWord | nextDWord))
+                    if (!AllBytesInUInt32AreAscii(currentDWord | nextDWord))
                     {
                         // At least one of the values wasn't all-ASCII.
                         // We need to figure out which one it was and stick it in the currentMask local.
 
-                        if (ASCIIUtility.AllBytesInUInt32AreAscii(currentDWord))
+                        if (AllBytesInUInt32AreAscii(currentDWord))
                         {
                             currentDWord = nextDWord; // this one is the culprit
                             pBuffer += 4;
@@ -444,7 +434,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin
             {
                 currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
 
-                if (!ASCIIUtility.AllBytesInUInt32AreAscii(currentDWord))
+                if (!AllBytesInUInt32AreAscii(currentDWord))
                 {
                     goto FoundNonAsciiDataInCurrentDWord;
                 }
@@ -459,7 +449,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin
             {
                 currentDWord = Unsafe.ReadUnaligned<ushort>(pBuffer);
 
-                if (!ASCIIUtility.AllBytesInUInt32AreAscii(currentDWord))
+                if (!AllBytesInUInt32AreAscii(currentDWord))
                 {
                     // We only care about the 0x0080 bit of the value. If it's not set, then we
                     // increment currentOffset by 1. If it's set, we don't increment it at all.
@@ -512,7 +502,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Default(char* pBuffer, n
 
             char* pOriginalBuffer = pBuffer;
 
+#if NET
             Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));
+#endif
 
             // Before we drain off char-by-char, try a generic vectorized loop.
             // Only run the loop if we have at least two vectors we can pull out.
@@ -676,7 +668,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
             Vector128<short> asciiMaskForPXOR = Vector128.Create(unchecked((short)0x8000)); // used for PXOR
             Vector128<short> asciiMaskForPCMPGTW = Vector128.Create(unchecked((short)0x807F)); // used for PCMPGTW
 
+#if NET
             Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));
+#endif
 
             // Read the first vector unaligned.
 
@@ -1526,7 +1520,7 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf
                 do
                 {
                     asciiData = Unsafe.ReadUnaligned<uint>(pAsciiBuffer + currentOffset);
-                    if (!ASCIIUtility.AllBytesInUInt32AreAscii(asciiData))
+                    if (!AllBytesInUInt32AreAscii(asciiData))
                     {
                         goto FoundNonAsciiData;
                     }
@@ -1541,7 +1535,7 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf
             if (((uint)remainingElementCount & 2) != 0)
             {
                 asciiData = Unsafe.ReadUnaligned<ushort>(pAsciiBuffer + currentOffset);
-                if (!ASCIIUtility.AllBytesInUInt32AreAscii(asciiData))
+                if (!AllBytesInUInt32AreAscii(asciiData))
                 {
                     goto FoundNonAsciiData;
                 }
@@ -1580,7 +1574,7 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf
 
         FoundNonAsciiData:
 
-            Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(asciiData), "Shouldn't have reached this point if we have an all-ASCII input.");
+            Debug.Assert(!AllBytesInUInt32AreAscii(asciiData), "Shouldn't have reached this point if we have an all-ASCII input.");
 
             // Drain ASCII bytes one at a time.
 
@@ -1693,7 +1687,7 @@ private static unsafe nuint WidenAsciiToUtf16_Sse2(byte* pAsciiBuffer, char* pUt
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value)
         {
-            Debug.Assert(ASCIIUtility.AllBytesInUInt32AreAscii(value));
+            Debug.Assert(AllBytesInUInt32AreAscii(value));
 
             if (Bmi2.X64.IsSupported)
             {
diff --git a/src/DotNetty.Common/Internal/ASCIIUtility.x32.cs b/src/DotNetty.Common/Internal/ASCIIUtility.x32.cs
deleted file mode 100644
index 5b1b80dcd..000000000
--- a/src/DotNetty.Common/Internal/ASCIIUtility.x32.cs
+++ /dev/null
@@ -1,1729 +0,0 @@
-﻿// borrowed from https://github.com/dotnet/corefx/blob/release/3.1/src/Common/src/CoreLib/System/Text/ASCIIUtility.cs
-
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
-
-#if NETCOREAPP_3_0_GREATER
-using System;
-using System.Diagnostics;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
-using nint = System.Int32;
-using nuint = System.UInt32;
-
-namespace DotNetty.Common.Internal
-{
-    internal static class ASCIIUtility32
-    {
-#if DEBUG
-        static ASCIIUtility32()
-        {
-            Debug.Assert(sizeof(nint) == IntPtr.Size && nint.MinValue < 0, "nint is defined incorrectly.");
-            Debug.Assert(sizeof(nuint) == IntPtr.Size && nuint.MinValue == 0, "nuint is defined incorrectly.");
-        }
-#endif // DEBUG
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool AllBytesInUInt64AreAscii(ulong value)
-        {
-            // If the high bit of any byte is set, that byte is non-ASCII.
-
-            return (0ul >= (value & ASCIIUtility.UInt64HighBitsOnlyMask));
-        }
-
-        /// <summary>
-        /// Returns <see langword="true"/> iff all chars in <paramref name="value"/> are ASCII.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool AllCharsInUInt32AreAscii(uint value)
-        {
-            return (0u >= (value & ~0x007F007Fu));
-        }
-
-        /// <summary>
-        /// Returns <see langword="true"/> iff all chars in <paramref name="value"/> are ASCII.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static bool AllCharsInUInt64AreAscii(ulong value)
-        {
-            return (0ul >= (value & ~0x007F007F_007F007Ful));
-        }
-
-        /// <summary>
-        /// Given a DWORD which represents two packed chars in machine-endian order,
-        /// <see langword="true"/> iff the first char (in machine-endian order) is ASCII.
-        /// </summary>
-        /// <param name="value"></param>
-        /// <returns></returns>
-        private static bool FirstCharInUInt32IsAscii(uint value)
-        {
-            return (BitConverter.IsLittleEndian && 0u >= (value & 0xFF80u))
-                || (!BitConverter.IsLittleEndian && 0u >= (value & 0xFF800000u));
-        }
-
-        /// <summary>
-        /// Returns the index in <paramref name="pBuffer"/> where the first non-ASCII byte is found.
-        /// Returns <paramref name="bufferLength"/> if the buffer is empty or all-ASCII.
-        /// </summary>
-        /// <returns>An ASCII byte is defined as 0x00 - 0x7F, inclusive.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bufferLength)
-        {
-            // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
-            // code below. This has two benefits: (a) we can take advantage of specific instructions like
-            // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
-            // this method is running.
-
-            return (Sse2.IsSupported)
-                ? GetIndexOfFirstNonAsciiByte_Sse2(pBuffer, bufferLength)
-                : GetIndexOfFirstNonAsciiByte_Default(pBuffer, bufferLength);
-        }
-
-        private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, nuint bufferLength)
-        {
-            // Squirrel away the original buffer reference. This method works by determining the exact
-            // byte reference where non-ASCII data begins, so we need this base value to perform the
-            // final subtraction at the end of the method to get the index into the original buffer.
-
-            byte* pOriginalBuffer = pBuffer;
-
-            // Before we drain off byte-by-byte, try a generic vectorized loop.
-            // Only run the loop if we have at least two vectors we can pull out.
-            // Note use of SBYTE instead of BYTE below; we're using the two's-complement
-            // representation of negative integers to act as a surrogate for "is ASCII?".
-
-            if (Vector.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector<sbyte>.Count)
-            {
-                uint SizeOfVectorInBytes = (uint)Vector<sbyte>.Count; // JIT will make this a const
-
-                if (Vector.GreaterThanOrEqualAll(Unsafe.ReadUnaligned<Vector<sbyte>>(pBuffer), Vector<sbyte>.Zero))
-                {
-                    // The first several elements of the input buffer were ASCII. Bump up the pointer to the
-                    // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
-                    // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
-
-                    byte* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVectorInBytes;
-                    pBuffer = (byte*)(((nuint)pBuffer + SizeOfVectorInBytes) & ~(nuint)(SizeOfVectorInBytes - 1));
-
-#if DEBUG
-                    long numBytesRead = pBuffer - pOriginalBuffer;
-                    Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVectorInBytes, "We should've made forward progress of at least one byte.");
-                    Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
-#endif
-
-                    Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector.");
-
-                    do
-                    {
-                        Debug.Assert((nuint)pBuffer % SizeOfVectorInBytes == 0, "Vector read should be aligned.");
-                        if (Vector.LessThanAny(Unsafe.Read<Vector<sbyte>>(pBuffer), Vector<sbyte>.Zero))
-                        {
-                            break; // found non-ASCII data
-                        }
-
-                        pBuffer += SizeOfVectorInBytes;
-                    } while (pBuffer <= pFinalVectorReadPos);
-
-                    // Adjust the remaining buffer length for the number of elements we just consumed.
-
-                    bufferLength -= (nuint)pBuffer;
-                    bufferLength += (nuint)pOriginalBuffer;
-                }
-            }
-
-            // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform
-            // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code
-            // path to drain any remaining ASCII bytes.
-            //
-            // We're going to perform unaligned reads, so prefer 32-bit reads instead of 64-bit reads.
-            // This also allows us to perform more optimized bit twiddling tricks to count the number of ASCII bytes.
-
-            uint currentUInt32;
-
-            // Try reading 64 bits at a time in a loop.
-
-            for (; bufferLength >= 8; bufferLength -= 8)
-            {
-                currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
-                uint nextUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer + 4);
-
-                if (!ASCIIUtility.AllBytesInUInt32AreAscii(currentUInt32 | nextUInt32))
-                {
-                    // One of these two values contains non-ASCII bytes.
-                    // Figure out which one it is, then put it in 'current' so that we can drain the ASCII bytes.
-
-                    if (ASCIIUtility.AllBytesInUInt32AreAscii(currentUInt32))
-                    {
-                        currentUInt32 = nextUInt32;
-                        pBuffer += 4;
-                    }
-
-                    goto FoundNonAsciiData;
-                }
-
-                pBuffer += 8; // consumed 8 ASCII bytes
-            }
-
-            // From this point forward we don't need to update bufferLength.
-            // Try reading 32 bits.
-
-            if ((bufferLength & 4) != 0)
-            {
-                currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
-                if (!ASCIIUtility.AllBytesInUInt32AreAscii(currentUInt32))
-                {
-                    goto FoundNonAsciiData;
-                }
-
-                pBuffer += 4;
-            }
-
-            // Try reading 16 bits.
-
-            if ((bufferLength & 2) != 0)
-            {
-                currentUInt32 = Unsafe.ReadUnaligned<ushort>(pBuffer);
-                if (!ASCIIUtility.AllBytesInUInt32AreAscii(currentUInt32))
-                {
-                    goto FoundNonAsciiData;
-                }
-
-                pBuffer += 2;
-            }
-
-            // Try reading 8 bits
-
-            if ((bufferLength & 1) != 0)
-            {
-                // If the buffer contains non-ASCII data, the comparison below will fail, and
-                // we'll end up not incrementing the buffer reference.
-
-                if (*(sbyte*)pBuffer >= 0)
-                {
-                    pBuffer++;
-                }
-            }
-
-        Finish:
-
-            nuint totalNumBytesRead = (nuint)pBuffer - (nuint)pOriginalBuffer;
-            return totalNumBytesRead;
-
-        FoundNonAsciiData:
-
-            Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input.");
-
-            // The method being called doesn't bother looking at whether the high byte is ASCII. There are only
-            // two scenarios: (a) either one of the earlier bytes is not ASCII and the search terminates before
-            // we get to the high byte; or (b) all of the earlier bytes are ASCII, so the high byte must be
-            // non-ASCII. In both cases we only care about the low 24 bits.
-
-            pBuffer += ASCIIUtility.CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentUInt32);
-            goto Finish;
-        }
-
-        private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuint bufferLength)
-        {
-            // JIT turns the below into constants
-
-            uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
-            nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);
-
-            Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
-            Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");
-
-            uint currentMask, secondMask;
-            byte* pOriginalBuffer = pBuffer;
-
-            // This method is written such that control generally flows top-to-bottom, avoiding
-            // jumps as much as possible in the optimistic case of a large enough buffer and
-            // "all ASCII". If we see non-ASCII data, we jump out of the hot paths to targets
-            // after all the main logic.
-
-            if (bufferLength < SizeOfVector128)
-            {
-                goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead
-            }
-
-            // Read the first vector unaligned.
-
-            currentMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
-
-            if (currentMask != 0)
-            {
-                goto FoundNonAsciiDataInCurrentMask;
-            }
-
-            // If we have less than 32 bytes to process, just go straight to the final unaligned
-            // read. There's no need to mess with the loop logic in the middle of this method.
-
-            if (bufferLength < 2 * SizeOfVector128)
-            {
-                goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
-            }
-
-            // Now adjust the read pointer so that future reads are aligned.
-
-            pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector128) & ~(nuint)MaskOfAllBitsInVector128);
-
-#if DEBUG
-            long numBytesRead = pBuffer - pOriginalBuffer;
-            Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector128, "We should've made forward progress of at least one byte.");
-            Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
-#endif
-
-            // Adjust the remaining length to account for what we just read.
-
-            bufferLength += (nuint)pOriginalBuffer;
-            bufferLength -= (nuint)pBuffer;
-
-            // The buffer is now properly aligned.
-            // Read 2 vectors at a time if possible.
-
-            if (bufferLength >= 2 * SizeOfVector128)
-            {
-                byte* pFinalVectorReadPos = (byte*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128);
-
-                // After this point, we no longer need to update the bufferLength value.
-
-                do
-                {
-                    Vector128<byte> firstVector = Sse2.LoadAlignedVector128(pBuffer);
-                    Vector128<byte> secondVector = Sse2.LoadAlignedVector128(pBuffer + SizeOfVector128);
-
-                    currentMask = (uint)Sse2.MoveMask(firstVector);
-                    secondMask = (uint)Sse2.MoveMask(secondVector);
-
-                    if ((currentMask | secondMask) != 0)
-                    {
-                        goto FoundNonAsciiDataInInnerLoop;
-                    }
-
-                    pBuffer += 2 * SizeOfVector128;
-                } while (pBuffer <= pFinalVectorReadPos);
-            }
-
-            // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from.
-            // Since the above loop doesn't update bufferLength, we can't rely on its absolute value.
-            // But we _can_ rely on it to tell us how much remaining data must be drained by looking
-            // at what bits of it are set. This works because had we updated it within the loop above,
-            // we would've been adding 2 * SizeOfVector128 on each iteration, but we only care about
-            // bits which are less significant than those that the addition would've acted on.
-
-            // If there is fewer than one vector length remaining, skip the next aligned read.
-
-            if (0u >= (bufferLength & SizeOfVector128))
-            {
-                goto DoFinalUnalignedVectorRead;
-            }
-
-            // At least one full vector's worth of data remains, so we can safely read it.
-            // Remember, at this point pBuffer is still aligned.
-
-            currentMask = (uint)Sse2.MoveMask(Sse2.LoadAlignedVector128(pBuffer));
-            if (currentMask != 0)
-            {
-                goto FoundNonAsciiDataInCurrentMask;
-            }
-
-        IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
-
-            pBuffer += SizeOfVector128;
-
-        DoFinalUnalignedVectorRead:
-
-            if (((byte)bufferLength & MaskOfAllBitsInVector128) != 0)
-            {
-                // Perform an unaligned read of the last vector.
-                // We need to adjust the pointer because we're re-reading data.
-
-                pBuffer += (bufferLength & MaskOfAllBitsInVector128) - SizeOfVector128;
-
-                currentMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
-                if (currentMask != 0)
-                {
-                    goto FoundNonAsciiDataInCurrentMask;
-                }
-
-                pBuffer += SizeOfVector128;
-            }
-
-        Finish:
-
-            return (nuint)pBuffer - (nuint)pOriginalBuffer; // and we're done!
-
-        FoundNonAsciiDataInInnerLoop:
-
-            // If the current (first) mask isn't the mask that contains non-ASCII data, then it must
-            // instead be the second mask. If so, skip the entire first mask and drain ASCII bytes
-            // from the second mask.
-
-            if (0u >= currentMask)
-            {
-                pBuffer += SizeOfVector128;
-                currentMask = secondMask;
-            }
-
-        FoundNonAsciiDataInCurrentMask:
-
-            // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
-            // Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't
-            // available, we'll fall back to a normal loop.
-
-            Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
-            pBuffer += (uint)BitOperations.TrailingZeroCount(currentMask);
-
-            goto Finish;
-
-        FoundNonAsciiDataInCurrentDWord:
-
-            uint currentDWord;
-            Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");
-            pBuffer += ASCIIUtility.CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentDWord);
-
-            goto Finish;
-
-        InputBufferLessThanOneVectorInLength:
-
-            // These code paths get hit if the original input length was less than one vector in size.
-            // We can't perform vectorized reads at this point, so we'll fall back to reading primitives
-            // directly. Note that all of these reads are unaligned.
-
-            Debug.Assert(bufferLength < SizeOfVector128);
-
-            // QWORD drain
-
-            if ((bufferLength & 8) != 0)
-            {
-                if (Bmi1.X64.IsSupported)
-                {
-                    // If we can use 64-bit tzcnt to count the number of leading ASCII bytes, prefer it.
-
-                    ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
-                    if (!AllBytesInUInt64AreAscii(candidateUInt64))
-                    {
-                        // Clear everything but the high bit of each byte, then tzcnt.
-                        // Remember the / 8 at the end to convert bit count to byte count.
-
-                        candidateUInt64 &= ASCIIUtility.UInt64HighBitsOnlyMask;
-                        pBuffer += (nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8);
-                        goto Finish;
-                    }
-                }
-                else
-                {
-                    // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead.
-
-                    currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
-                    uint nextDWord = Unsafe.ReadUnaligned<uint>(pBuffer + 4);
-
-                    if (!ASCIIUtility.AllBytesInUInt32AreAscii(currentDWord | nextDWord))
-                    {
-                        // At least one of the values wasn't all-ASCII.
-                        // We need to figure out which one it was and stick it in the currentMask local.
-
-                        if (ASCIIUtility.AllBytesInUInt32AreAscii(currentDWord))
-                        {
-                            currentDWord = nextDWord; // this one is the culprit
-                            pBuffer += 4;
-                        }
-
-                        goto FoundNonAsciiDataInCurrentDWord;
-                    }
-                }
-
-                pBuffer += 8; // successfully consumed 8 ASCII bytes
-            }
-
-            // DWORD drain
-
-            if ((bufferLength & 4) != 0)
-            {
-                currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
-
-                if (!ASCIIUtility.AllBytesInUInt32AreAscii(currentDWord))
-                {
-                    goto FoundNonAsciiDataInCurrentDWord;
-                }
-
-                pBuffer += 4; // successfully consumed 4 ASCII bytes
-            }
-
-            // WORD drain
-            // (We movzx to a DWORD for ease of manipulation.)
-
-            if ((bufferLength & 2) != 0)
-            {
-                currentDWord = Unsafe.ReadUnaligned<ushort>(pBuffer);
-
-                if (!ASCIIUtility.AllBytesInUInt32AreAscii(currentDWord))
-                {
-                    // We only care about the 0x0080 bit of the value. If it's not set, then we
-                    // increment currentOffset by 1. If it's set, we don't increment it at all.
-
-                    pBuffer += (nuint)((nint)(sbyte)currentDWord >> 7) + 1;
-                    goto Finish;
-                }
-
-                pBuffer += 2; // successfully consumed 2 ASCII bytes
-            }
-
-            // BYTE drain
-
-            if ((bufferLength & 1) != 0)
-            {
-                // sbyte has non-negative value if byte is ASCII.
-
-                if (*(sbyte*)(pBuffer) >= 0)
-                {
-                    pBuffer++; // successfully consumed a single byte
-                }
-            }
-
-            goto Finish;
-        }
-
-        /// <summary>
-        /// Returns the index in <paramref name="pBuffer"/> where the first non-ASCII char is found.
-        /// Returns <paramref name="bufferLength"/> if the buffer is empty or all-ASCII.
-        /// </summary>
-        /// <returns>An ASCII char is defined as 0x0000 - 0x007F, inclusive.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bufferLength /* in chars */)
-        {
-            // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
-            // code below. This has two benefits: (a) we can take advantage of specific instructions like
-            // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
-            // this method is running.
-
-            return (Sse2.IsSupported)
-                ? GetIndexOfFirstNonAsciiChar_Sse2(pBuffer, bufferLength)
-                : GetIndexOfFirstNonAsciiChar_Default(pBuffer, bufferLength);
-        }
-
-        private static unsafe nuint GetIndexOfFirstNonAsciiChar_Default(char* pBuffer, nuint bufferLength /* in chars */)
-        {
-            // Squirrel away the original buffer reference.This method works by determining the exact
-            // char reference where non-ASCII data begins, so we need this base value to perform the
-            // final subtraction at the end of the method to get the index into the original buffer.
-
-            char* pOriginalBuffer = pBuffer;
-
-            Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));
-
-            // Before we drain off char-by-char, try a generic vectorized loop.
-            // Only run the loop if we have at least two vectors we can pull out.
-
-            if (Vector.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector<ushort>.Count)
-            {
-                uint SizeOfVectorInChars = (uint)Vector<ushort>.Count; // JIT will make this a const
-                uint SizeOfVectorInBytes = (uint)Vector<byte>.Count; // JIT will make this a const
-
-                Vector<ushort> maxAscii = new Vector<ushort>(0x007F);
-
-                if (Vector.LessThanOrEqualAll(Unsafe.ReadUnaligned<Vector<ushort>>(pBuffer), maxAscii))
-                {
-                    // The first several elements of the input buffer were ASCII. Bump up the pointer to the
-                    // next aligned boundary, then perform aligned reads from here on out until we find non-ASCII
-                    // data or we approach the end of the buffer. It's possible we'll reread data; this is ok.
-
-                    char* pFinalVectorReadPos = pBuffer + bufferLength - SizeOfVectorInChars;
-                    pBuffer = (char*)(((nuint)pBuffer + SizeOfVectorInBytes) & ~(nuint)(SizeOfVectorInBytes - 1));
-
-#if DEBUG
-                    long numCharsRead = pBuffer - pOriginalBuffer;
-                    Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVectorInChars, "We should've made forward progress of at least one char.");
-                    Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
-#endif
-
-                    Debug.Assert(pBuffer <= pFinalVectorReadPos, "Should be able to read at least one vector.");
-
-                    do
-                    {
-                        Debug.Assert((nuint)pBuffer % SizeOfVectorInChars == 0, "Vector read should be aligned.");
-                        if (Vector.GreaterThanAny(Unsafe.Read<Vector<ushort>>(pBuffer), maxAscii))
-                        {
-                            break; // found non-ASCII data
-                        }
-                        pBuffer += SizeOfVectorInChars;
-                    } while (pBuffer <= pFinalVectorReadPos);
-
-                    // Adjust the remaining buffer length for the number of elements we just consumed.
-
-                    bufferLength -= ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char);
-                }
-            }
-
-            // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform
-            // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code
-            // path to drain any remaining ASCII chars.
-            //
-            // We're going to perform unaligned reads, so prefer 32-bit reads instead of 64-bit reads.
-            // This also allows us to perform more optimized bit twiddling tricks to count the number of ASCII chars.
-
-            uint currentUInt32;
-
-            // Try reading 64 bits at a time in a loop.
-
-            for (; bufferLength >= 4; bufferLength -= 4) // 64 bits = 4 * 16-bit chars
-            {
-                currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
-                uint nextUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer + 4 / sizeof(char));
-
-                if (!AllCharsInUInt32AreAscii(currentUInt32 | nextUInt32))
-                {
-                    // One of these two values contains non-ASCII chars.
-                    // Figure out which one it is, then put it in 'current' so that we can drain the ASCII chars.
-
-                    if (AllCharsInUInt32AreAscii(currentUInt32))
-                    {
-                        currentUInt32 = nextUInt32;
-                        pBuffer += 2;
-                    }
-
-                    goto FoundNonAsciiData;
-                }
-
-                pBuffer += 4; // consumed 4 ASCII chars
-            }
-
-            // From this point forward we don't need to keep track of the remaining buffer length.
-            // Try reading 32 bits.
-
-            if ((bufferLength & 2) != 0) // 32 bits = 2 * 16-bit chars
-            {
-                currentUInt32 = Unsafe.ReadUnaligned<uint>(pBuffer);
-                if (!AllCharsInUInt32AreAscii(currentUInt32))
-                {
-                    goto FoundNonAsciiData;
-                }
-
-                pBuffer += 2;
-            }
-
-            // Try reading 16 bits.
-            // No need to try an 8-bit read after this since we're working with chars.
-
-            if ((bufferLength & 1) != 0)
-            {
-                // If the buffer contains non-ASCII data, the comparison below will fail, and
-                // we'll end up not incrementing the buffer reference.
-
-                if (*pBuffer <= 0x007F)
-                {
-                    pBuffer++;
-                }
-            }
-
-        Finish:
-
-            nuint totalNumBytesRead = (nuint)pBuffer - (nuint)pOriginalBuffer;
-            Debug.Assert(totalNumBytesRead % sizeof(char) == 0, "Total number of bytes read should be even since we're working with chars.");
-            return totalNumBytesRead / sizeof(char); // convert byte count -> char count before returning
-
-        FoundNonAsciiData:
-
-            Debug.Assert(!AllCharsInUInt32AreAscii(currentUInt32), "Shouldn't have reached this point if we have an all-ASCII input.");
-
-            // We don't bother looking at the second char - only the first char.
-
-            if (FirstCharInUInt32IsAscii(currentUInt32))
-            {
-                pBuffer++;
-            }
-
-            goto Finish;
-        }
-
-        private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuint bufferLength /* in chars */)
-        {
-            // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
-            // will be elided by JIT once we determine which specific ISAs we support.
-
-            // Quick check for empty inputs.
-
-            if (0u >= bufferLength)
-            {
-                return 0;
-            }
-
-            // JIT turns the below into constants
-
-            uint SizeOfVector128InBytes = (uint)Unsafe.SizeOf<Vector128<byte>>();
-            uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char);
-
-            Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
-            Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");
-
-            Vector128<short> firstVector, secondVector;
-            uint currentMask;
-            char* pOriginalBuffer = pBuffer;
-
-            if (bufferLength < SizeOfVector128InChars)
-            {
-                goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead
-            }
-
-            // This method is written such that control generally flows top-to-bottom, avoiding
-            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
-            // data, we jump out of the hot paths to targets at the end of the method.
-
-            Vector128<short> asciiMaskForPTEST = Vector128.Create(unchecked((short)0xFF80)); // used for PTEST on supported hardware
-            Vector128<ushort> asciiMaskForPMINUW = Vector128.Create((ushort)0x0080); // used for PMINUW on supported hardware
-            Vector128<short> asciiMaskForPXOR = Vector128.Create(unchecked((short)0x8000)); // used for PXOR
-            Vector128<short> asciiMaskForPCMPGTW = Vector128.Create(unchecked((short)0x807F)); // used for PCMPGTW
-
-            Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));
-
-            // Read the first vector unaligned.
-
-            firstVector = Sse2.LoadVector128((short*)pBuffer); // unaligned load
-
-            if (Sse41.IsSupported)
-            {
-                // The SSE41-optimized code path works by forcing the 0x0080 bit in each WORD of the vector to be
-                // set iff the WORD element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector
-                // in order to extract the mask.
-                currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
-            }
-            else
-            {
-                // The SSE2-optimized code path works by forcing each WORD of the vector to be 0xFFFF iff the WORD
-                // element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector in order to extract
-                // the mask.
-                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
-            }
-
-            if (currentMask != 0)
-            {
-                goto FoundNonAsciiDataInCurrentMask;
-            }
-
-            // If we have less than 32 bytes to process, just go straight to the final unaligned
-            // read. There's no need to mess with the loop logic in the middle of this method.
-
-            // Adjust the remaining length to account for what we just read.
-            // For the remainder of this code path, bufferLength will be in bytes, not chars.
-
-            bufferLength <<= 1; // chars to bytes
-
-            if (bufferLength < 2 * SizeOfVector128InBytes)
-            {
-                goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
-            }
-
-            // Now adjust the read pointer so that future reads are aligned.
-
-            pBuffer = (char*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1));
-
-#if DEBUG
-            long numCharsRead = pBuffer - pOriginalBuffer;
-            Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector128InChars, "We should've made forward progress of at least one char.");
-            Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
-#endif
-
-            // Adjust remaining buffer length.
-
-            bufferLength += (nuint)pOriginalBuffer;
-            bufferLength -= (nuint)pBuffer;
-
-            // The buffer is now properly aligned.
-            // Read 2 vectors at a time if possible.
-
-            if (bufferLength >= 2 * SizeOfVector128InBytes)
-            {
-                char* pFinalVectorReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes);
-
-                // After this point, we no longer need to update the bufferLength value.
-
-                do
-                {
-                    firstVector = Sse2.LoadAlignedVector128((short*)pBuffer);
-                    secondVector = Sse2.LoadAlignedVector128((short*)pBuffer + SizeOfVector128InChars);
-                    Vector128<short> combinedVector = Sse2.Or(firstVector, secondVector);
-
-                    if (Sse41.IsSupported)
-                    {
-                        // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
-                        // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
-                        if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
-                        {
-                            goto FoundNonAsciiDataInFirstOrSecondVector;
-                        }
-                    }
-                    else
-                    {
-                        // See comment earlier in the method for an explanation of how the below logic works.
-                        if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
-                        {
-                            goto FoundNonAsciiDataInFirstOrSecondVector;
-                        }
-                    }
-
-                    pBuffer += 2 * SizeOfVector128InChars;
-                } while (pBuffer <= pFinalVectorReadPos);
-            }
-
-            // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from.
-            // Since the above loop doesn't update bufferLength, we can't rely on its absolute value.
-            // But we _can_ rely on it to tell us how much remaining data must be drained by looking
-            // at what bits of it are set. This works because had we updated it within the loop above,
-            // we would've been adding 2 * SizeOfVector128 on each iteration, but we only care about
-            // bits which are less significant than those that the addition would've acted on.
-
-            // If there is fewer than one vector length remaining, skip the next aligned read.
-            // Remember, at this point bufferLength is measured in bytes, not chars.
-
-            if (0u >= (bufferLength & SizeOfVector128InBytes))
-            {
-                goto DoFinalUnalignedVectorRead;
-            }
-
-            // At least one full vector's worth of data remains, so we can safely read it.
-            // Remember, at this point pBuffer is still aligned.
-
-            firstVector = Sse2.LoadAlignedVector128((short*)pBuffer);
-
-            if (Sse41.IsSupported)
-            {
-                // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
-                // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
-                if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
-                {
-                    goto FoundNonAsciiDataInFirstVector;
-                }
-            }
-            else
-            {
-                // See comment earlier in the method for an explanation of how the below logic works.
-                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
-                if (currentMask != 0)
-                {
-                    goto FoundNonAsciiDataInCurrentMask;
-                }
-            }
-
-        IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
-
-            pBuffer += SizeOfVector128InChars;
-
-        DoFinalUnalignedVectorRead:
-
-            if (((byte)bufferLength & (SizeOfVector128InBytes - 1)) != 0)
-            {
-                // Perform an unaligned read of the last vector.
-                // We need to adjust the pointer because we're re-reading data.
-
-                pBuffer = (char*)((byte*)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes);
-                firstVector = Sse2.LoadVector128((short*)pBuffer); // unaligned load
-
-                if (Sse41.IsSupported)
-                {
-                    // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
-                    // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
-                    if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
-                    {
-                        goto FoundNonAsciiDataInFirstVector;
-                    }
-                }
-                else
-                {
-                    // See comment earlier in the method for an explanation of how the below logic works.
-                    currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
-                    if (currentMask != 0)
-                    {
-                        goto FoundNonAsciiDataInCurrentMask;
-                    }
-                }
-
-                pBuffer += SizeOfVector128InChars;
-            }
-
-        Finish:
-
-            Debug.Assert(((nuint)pBuffer - (nuint)pOriginalBuffer) % 2 == 0, "Shouldn't have incremented any pointer by an odd byte count.");
-            return ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char); // and we're done! (remember to adjust for char count)
-
-        FoundNonAsciiDataInFirstOrSecondVector:
-
-            // We don't know if the first or the second vector contains non-ASCII data. Check the first
-            // vector, and if that's all-ASCII then the second vector must be the culprit. Either way
-            // we'll make sure the first vector local is the one that contains the non-ASCII data.
-
-            // See comment earlier in the method for an explanation of how the below logic works.
-            if (Sse41.IsSupported)
-            {
-                if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
-                {
-                    goto FoundNonAsciiDataInFirstVector;
-                }
-            }
-            else
-            {
-                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
-                if (currentMask != 0)
-                {
-                    goto FoundNonAsciiDataInCurrentMask;
-                }
-            }
-
-            // Wasn't the first vector; must be the second.
-
-            pBuffer += SizeOfVector128InChars;
-            firstVector = secondVector;
-
-        FoundNonAsciiDataInFirstVector:
-
-            // See comment earlier in the method for an explanation of how the below logic works.
-            if (Sse41.IsSupported)
-            {
-                currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
-            }
-            else
-            {
-                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
-            }
-
-        FoundNonAsciiDataInCurrentMask:
-
-            // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
-            // Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't
-            // available, we'll fall back to a normal loop. (Even though the original vector used WORD elements,
-            // masks work on BYTE elements, and we account for this in the final fixup.)
-
-            Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
-            pBuffer = (char*)((byte*)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask));
-
-            goto Finish;
-
-        FoundNonAsciiDataInCurrentDWord:
-
-            uint currentDWord;
-            Debug.Assert(!AllCharsInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");
-
-            if (FirstCharInUInt32IsAscii(currentDWord))
-            {
-                pBuffer++; // skip past the ASCII char
-            }
-
-            goto Finish;
-
-        InputBufferLessThanOneVectorInLength:
-
-            // These code paths get hit if the original input length was less than one vector in size.
-            // We can't perform vectorized reads at this point, so we'll fall back to reading primitives
-            // directly. Note that all of these reads are unaligned.
-
-            // Reminder: If this code path is hit, bufferLength is still a char count, not a byte count.
-            // We skipped the code path that multiplied the count by sizeof(char).
-
-            Debug.Assert(bufferLength < SizeOfVector128InChars);
-
-            // QWORD drain
-
-            if ((bufferLength & 4) != 0)
-            {
-                if (Bmi1.X64.IsSupported)
-                {
-                    // If we can use 64-bit tzcnt to count the number of leading ASCII chars, prefer it.
-
-                    ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
-                    if (!AllCharsInUInt64AreAscii(candidateUInt64))
-                    {
-                        // Clear the low 7 bits (the ASCII bits) of each char, then tzcnt.
-                        // Remember the / 8 at the end to convert bit count to byte count,
-                        // then the & ~1 at the end to treat a match in the high byte of
-                        // any char the same as a match in the low byte of that same char.
-
-                        candidateUInt64 &= 0xFF80FF80_FF80FF80ul;
-                        pBuffer = (char*)((byte*)pBuffer + ((nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8) & ~(nuint)1));
-                        goto Finish;
-                    }
-                }
-                else
-                {
-                    // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead.
-
-                    currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
-                    uint nextDWord = Unsafe.ReadUnaligned<uint>(pBuffer + 4 / sizeof(char));
-
-                    if (!AllCharsInUInt32AreAscii(currentDWord | nextDWord))
-                    {
-                        // At least one of the values wasn't all-ASCII.
-                        // We need to figure out which one it was and stick it in the currentMask local.
-
-                        if (AllCharsInUInt32AreAscii(currentDWord))
-                        {
-                            currentDWord = nextDWord; // this one is the culprit
-                            pBuffer += 4 / sizeof(char);
-                        }
-
-                        goto FoundNonAsciiDataInCurrentDWord;
-                    }
-                }
-
-                pBuffer += 4; // successfully consumed 4 ASCII chars
-            }
-
-            // DWORD drain
-
-            if ((bufferLength & 2) != 0)
-            {
-                currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
-
-                if (!AllCharsInUInt32AreAscii(currentDWord))
-                {
-                    goto FoundNonAsciiDataInCurrentDWord;
-                }
-
-                pBuffer += 2; // successfully consumed 2 ASCII chars
-            }
-
-            // WORD drain
-            // This is the final drain; there's no need for a BYTE drain since our elemental type is 16-bit char.
-
-            if ((bufferLength & 1) != 0)
-            {
-                if (*pBuffer <= 0x007F)
-                {
-                    pBuffer++; // successfully consumed a single char
-                }
-            }
-
-            goto Finish;
-        }
-
-        /// <summary>
-        /// Given a QWORD which represents a buffer of 4 ASCII chars in machine-endian order,
-        /// narrows each WORD to a BYTE, then writes the 4-byte result to the output buffer
-        /// also in machine-endian order.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref byte outputBuffer, ulong value)
-        {
-            Debug.Assert(AllCharsInUInt64AreAscii(value));
-
-            if (Bmi2.X64.IsSupported)
-            {
-                // BMI2 will work regardless of the processor's endianness.
-                Unsafe.WriteUnaligned(ref outputBuffer, (uint)Bmi2.X64.ParallelBitExtract(value, 0x00FF00FF_00FF00FFul));
-            }
-            else
-            {
-                if (BitConverter.IsLittleEndian)
-                {
-                    outputBuffer = (byte)value;
-                    value >>= 16;
-                    Unsafe.Add(ref outputBuffer, 1) = (byte)value;
-                    value >>= 16;
-                    Unsafe.Add(ref outputBuffer, 2) = (byte)value;
-                    value >>= 16;
-                    Unsafe.Add(ref outputBuffer, 3) = (byte)value;
-                }
-                else
-                {
-                    Unsafe.Add(ref outputBuffer, 3) = (byte)value;
-                    value >>= 16;
-                    Unsafe.Add(ref outputBuffer, 2) = (byte)value;
-                    value >>= 16;
-                    Unsafe.Add(ref outputBuffer, 1) = (byte)value;
-                    value >>= 16;
-                    outputBuffer = (byte)value;
-                }
-            }
-        }
-
-        /// <summary>
-        /// Given a DWORD which represents a buffer of 2 ASCII chars in machine-endian order,
-        /// narrows each WORD to a BYTE, then writes the 2-byte result to the output buffer also in
-        /// machine-endian order.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref byte outputBuffer, uint value)
-        {
-            Debug.Assert(AllCharsInUInt32AreAscii(value));
-
-            if (BitConverter.IsLittleEndian)
-            {
-                outputBuffer = (byte)value;
-                Unsafe.Add(ref outputBuffer, 1) = (byte)(value >> 16);
-            }
-            else
-            {
-                Unsafe.Add(ref outputBuffer, 1) = (byte)value;
-                outputBuffer = (byte)(value >> 16);
-            }
-        }
-
-        /// <summary>
-        /// Copies as many ASCII characters (U+0000..U+007F) as possible from <paramref name="pUtf16Buffer"/>
-        /// to <paramref name="pAsciiBuffer"/>, stopping when the first non-ASCII character is encountered
-        /// or once <paramref name="elementCount"/> elements have been converted. Returns the total number
-        /// of elements that were able to be converted.
-        /// </summary>
-        public static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
-        {
-            nuint currentOffset = 0;
-
-            uint utf16Data32BitsHigh = 0, utf16Data32BitsLow = 0;
-            ulong utf16Data64Bits = 0;
-
-            // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
-            // code below. This has two benefits: (a) we can take advantage of specific instructions like
-            // pmovmskb, ptest, vpminuw which we know are optimized, and (b) we can avoid downclocking the
-            // processor while this method is running.
-
-            if (Sse2.IsSupported)
-            {
-                Debug.Assert(BitConverter.IsLittleEndian, "Assume little endian if SSE2 is supported.");
-
-                if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>())
-                {
-                    // Since there's overhead to setting up the vectorized code path, we only want to
-                    // call into it after a quick probe to ensure the next immediate characters really are ASCII.
-                    // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method.
-
-                    if (PlatformDependent.Is64BitProcess)
-                    {
-                        utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer);
-                        if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
-                        {
-                            goto FoundNonAsciiDataIn64BitRead;
-                        }
-                    }
-                    else
-                    {
-                        utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer);
-                        utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + 4 / sizeof(char));
-                        if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
-                        {
-                            goto FoundNonAsciiDataIn64BitRead;
-                        }
-                    }
-
-                    currentOffset = NarrowUtf16ToAscii_Sse2(pUtf16Buffer, pAsciiBuffer, elementCount);
-                }
-            }
-            else if (Vector.IsHardwareAccelerated)
-            {
-                uint SizeOfVector = (uint)Unsafe.SizeOf<Vector<byte>>(); // JIT will make this a const
-
-                // Only bother vectorizing if we have enough data to do so.
-                if (elementCount >= 2 * SizeOfVector)
-                {
-                    // Since there's overhead to setting up the vectorized code path, we only want to
-                    // call into it after a quick probe to ensure the next immediate characters really are ASCII.
-                    // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method.
-
-                    if (PlatformDependent.Is64BitProcess)
-                    {
-                        utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer);
-                        if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
-                        {
-                            goto FoundNonAsciiDataIn64BitRead;
-                        }
-                    }
-                    else
-                    {
-                        utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer);
-                        utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + 4 / sizeof(char));
-                        if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
-                        {
-                            goto FoundNonAsciiDataIn64BitRead;
-                        }
-                    }
-
-                    Vector<ushort> maxAscii = new Vector<ushort>(0x007F);
-
-                    nuint finalOffsetWhereCanLoop = elementCount - 2 * SizeOfVector;
-                    do
-                    {
-                        Vector<ushort> utf16VectorHigh = Unsafe.ReadUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset);
-                        Vector<ushort> utf16VectorLow = Unsafe.ReadUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset + Vector<ushort>.Count);
-
-                        if (Vector.GreaterThanAny(Vector.BitwiseOr(utf16VectorHigh, utf16VectorLow), maxAscii))
-                        {
-                            break; // found non-ASCII data
-                        }
-
-                        // TODO: Is the below logic also valid for big-endian platforms?
-                        Vector<byte> asciiVector = Vector.Narrow(utf16VectorHigh, utf16VectorLow);
-                        Unsafe.WriteUnaligned<Vector<byte>>(pAsciiBuffer + currentOffset, asciiVector);
-
-                        currentOffset += SizeOfVector;
-                    } while (currentOffset <= finalOffsetWhereCanLoop);
-                }
-            }
-
-            Debug.Assert(currentOffset <= elementCount);
-            nuint remainingElementCount = elementCount - currentOffset;
-
-            // Try to narrow 64 bits -> 32 bits at a time.
-            // We needn't update remainingElementCount after this point.
-
-            if (remainingElementCount >= 4)
-            {
-                nuint finalOffsetWhereCanLoop = currentOffset + remainingElementCount - 4;
-                do
-                {
-                    if (PlatformDependent.Is64BitProcess)
-                    {
-                        // Only perform QWORD reads on a 64-bit platform.
-                        utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer + currentOffset);
-                        if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
-                        {
-                            goto FoundNonAsciiDataIn64BitRead;
-                        }
-
-                        NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data64Bits);
-                    }
-                    else
-                    {
-                        utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + currentOffset);
-                        utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + currentOffset + 4 / sizeof(char));
-                        if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
-                        {
-                            goto FoundNonAsciiDataIn64BitRead;
-                        }
-
-                        NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
-                        NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset + 2], utf16Data32BitsLow);
-                    }
-
-                    currentOffset += 4;
-                } while (currentOffset <= finalOffsetWhereCanLoop);
-            }
-
-            // Try to narrow 32 bits -> 16 bits.
-
-            if (((uint)remainingElementCount & 2) != 0)
-            {
-                utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + currentOffset);
-                if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh))
-                {
-                    goto FoundNonAsciiDataInHigh32Bits;
-                }
-
-                NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
-                currentOffset += 2;
-            }
-
-            // Try to narrow 16 bits -> 8 bits.
-
-            if (((uint)remainingElementCount & 1) != 0)
-            {
-                utf16Data32BitsHigh = pUtf16Buffer[currentOffset];
-                if (utf16Data32BitsHigh <= 0x007Fu)
-                {
-                    pAsciiBuffer[currentOffset] = (byte)utf16Data32BitsHigh;
-                    currentOffset++;
-                }
-            }
-
-        Finish:
-
-            return currentOffset;
-
-        FoundNonAsciiDataIn64BitRead:
-
-            if (PlatformDependent.Is64BitProcess)
-            {
-                // Try checking the first 32 bits of the buffer for non-ASCII data.
-                // Regardless, we'll move the non-ASCII data into the utf16Data32BitsHigh local.
-
-                if (BitConverter.IsLittleEndian)
-                {
-                    utf16Data32BitsHigh = (uint)utf16Data64Bits;
-                }
-                else
-                {
-                    utf16Data32BitsHigh = (uint)(utf16Data64Bits >> 32);
-                }
-
-                if (AllCharsInUInt32AreAscii(utf16Data32BitsHigh))
-                {
-                    NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
-
-                    if (BitConverter.IsLittleEndian)
-                    {
-                        utf16Data32BitsHigh = (uint)(utf16Data64Bits >> 32);
-                    }
-                    else
-                    {
-                        utf16Data32BitsHigh = (uint)utf16Data64Bits;
-                    }
-
-                    currentOffset += 2;
-                }
-            }
-            else
-            {
-                // Need to determine if the high or the low 32-bit value contained non-ASCII data.
-                // Regardless, we'll move the non-ASCII data into the utf16Data32BitsHigh local.
-
-                if (AllCharsInUInt32AreAscii(utf16Data32BitsHigh))
-                {
-                    NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref pAsciiBuffer[currentOffset], utf16Data32BitsHigh);
-                    utf16Data32BitsHigh = utf16Data32BitsLow;
-                    currentOffset += 2;
-                }
-            }
-
-        FoundNonAsciiDataInHigh32Bits:
-
-            Debug.Assert(!AllCharsInUInt32AreAscii(utf16Data32BitsHigh), "Shouldn't have reached this point if we have an all-ASCII input.");
-
-            // There's at most one char that needs to be drained.
-
-            if (FirstCharInUInt32IsAscii(utf16Data32BitsHigh))
-            {
-                if (!BitConverter.IsLittleEndian)
-                {
-                    utf16Data32BitsHigh >>= 16; // move high char down to low char
-                }
-
-                pAsciiBuffer[currentOffset] = (byte)utf16Data32BitsHigh;
-                currentOffset++;
-            }
-
-            goto Finish;
-        }
-
-        private static unsafe nuint NarrowUtf16ToAscii_Sse2(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
-        {
-            // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
-            // will be elided by JIT once we determine which specific ISAs we support.
-
-            // JIT turns the below into constants
-
-            uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
-            nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);
-
-            // This method is written such that control generally flows top-to-bottom, avoiding
-            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
-            // data, we jump out of the hot paths to targets at the end of the method.
-
-            Debug.Assert(Sse2.IsSupported);
-            Debug.Assert(BitConverter.IsLittleEndian);
-            Debug.Assert(elementCount >= 2 * SizeOfVector128);
-
-            Vector128<short> asciiMaskForPTEST = Vector128.Create(unchecked((short)0xFF80)); // used for PTEST on supported hardware
-            Vector128<short> asciiMaskForPXOR = Vector128.Create(unchecked((short)0x8000)); // used for PXOR
-            Vector128<short> asciiMaskForPCMPGTW = Vector128.Create(unchecked((short)0x807F)); // used for PCMPGTW
-
-            // First, perform an unaligned read of the first part of the input buffer.
-
-            Vector128<short> utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer); // unaligned load
-
-            // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do.
-            // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
-
-            if (Sse41.IsSupported)
-            {
-                if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
-                {
-                    return 0;
-                }
-            }
-            else
-            {
-                if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
-                {
-                    return 0;
-                }
-            }
-
-            // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
-
-            Vector128<byte> asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
-            Sse2.StoreScalar((ulong*)pAsciiBuffer, asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
-
-            nuint currentOffsetInElements = SizeOfVector128 / 2; // we processed 8 elements so far
-
-            // We're going to get the best performance when we have aligned writes, so we'll take the
-            // hit of potentially unaligned reads in order to hit this sweet spot.
-
-            // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote
-            // the 8 bytes previously. If the 0x08 bit is set at the pinned address, then the 8 bytes we wrote
-            // previously mean that the 0x08 bit is *not* set at address &pAsciiBuffer[SizeOfVector128 / 2]. In
-            // that case we can immediately back up to the previous aligned boundary and start the main loop.
-            // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at
-            // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump
-            // just past the next aligned boundary address.
-
-            if (0u >= ((uint)pAsciiBuffer & (SizeOfVector128 / 2)))
-            {
-                // We need to perform one more partial vector write before we can get the alignment we want.
-
-                utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements); // unaligned load
-
-                // See comments earlier in this method for information about how this works.
-                if (Sse41.IsSupported)
-                {
-                    if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
-                    {
-                        goto Finish;
-                    }
-                }
-                else
-                {
-                    if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
-                    {
-                        goto Finish;
-                    }
-                }
-
-                // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
-                asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
-                Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
-            }
-
-            // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
-            // point, then use that as the base offset going forward.
-
-            currentOffsetInElements = SizeOfVector128 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector128);
-            Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector128, "We wrote at least 1 byte but no more than a whole vector.");
-
-            Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer.");
-            Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector128, "We should be able to run at least one whole vector.");
-
-            nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128;
-            do
-            {
-                // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector.
-
-                utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements); // unaligned load
-                Vector128<short> utf16VectorSecond = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements + SizeOfVector128 / sizeof(short)); // unaligned load
-                Vector128<short> combinedVector = Sse2.Or(utf16VectorFirst, utf16VectorSecond);
-
-                // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
-                if (Sse41.IsSupported)
-                {
-                    if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
-                    {
-                        goto FoundNonAsciiDataInLoop;
-                    }
-                }
-                else
-                {
-                    if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
-                    {
-                        goto FoundNonAsciiDataInLoop;
-                    }
-                }
-
-                // Build up the UTF-8 vector and perform the store.
-
-                asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorSecond);
-
-                Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector128 == 0, "Write should be aligned.");
-                Sse2.StoreAligned(pAsciiBuffer + currentOffsetInElements, asciiVector); // aligned
-
-                currentOffsetInElements += SizeOfVector128;
-            } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop);
-
-        Finish:
-
-            // There might be some ASCII data left over. That's fine - we'll let our caller handle the final drain.
-            return currentOffsetInElements;
-
-        FoundNonAsciiDataInLoop:
-
-            // Can we at least narrow the high vector?
-            // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
-            if (Sse41.IsSupported)
-            {
-                if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
-                {
-                    goto Finish; // found non-ASCII data
-                }
-            }
-            else
-            {
-                if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
-                {
-                    goto Finish; // found non-ASCII data
-                }
-            }
-
-            // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
-            asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
-
-            Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
-
-            Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is aligned
-            currentOffsetInElements += SizeOfVector128 / 2;
-
-            goto Finish;
-        }
-
-        /// <summary>
-        /// Copies as many ASCII bytes (00..7F) as possible from <paramref name="pAsciiBuffer"/>
-        /// to <paramref name="pUtf16Buffer"/>, stopping when the first non-ASCII byte is encountered
-        /// or once <paramref name="elementCount"/> elements have been converted. Returns the total number
-        /// of elements that were able to be converted.
-        /// </summary>
-        public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
-        {
-            nuint currentOffset = 0;
-
-            // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
-            // code below. This has two benefits: (a) we can take advantage of specific instructions like
-            // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
-            // this method is running.
-
-            if (Sse2.IsSupported)
-            {
-                if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>())
-                {
-                    currentOffset = WidenAsciiToUtf16_Sse2(pAsciiBuffer, pUtf16Buffer, elementCount);
-                }
-            }
-            else if (Vector.IsHardwareAccelerated)
-            {
-                uint SizeOfVector = (uint)Unsafe.SizeOf<Vector<byte>>(); // JIT will make this a const
-
-                // Only bother vectorizing if we have enough data to do so.
-                if (elementCount >= SizeOfVector)
-                {
-                    // Note use of SBYTE instead of BYTE below; we're using the two's-complement
-                    // representation of negative integers to act as a surrogate for "is ASCII?".
-
-                    nuint finalOffsetWhereCanLoop = elementCount - SizeOfVector;
-                    do
-                    {
-                        Vector<sbyte> asciiVector = Unsafe.ReadUnaligned<Vector<sbyte>>(pAsciiBuffer + currentOffset);
-                        if (Vector.LessThanAny(asciiVector, Vector<sbyte>.Zero))
-                        {
-                            break; // found non-ASCII data
-                        }
-
-                        Vector.Widen(Vector.AsVectorByte(asciiVector), out Vector<ushort> utf16LowVector, out Vector<ushort> utf16HighVector);
-
-                        // TODO: Is the below logic also valid for big-endian platforms?
-                        Unsafe.WriteUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset, utf16LowVector);
-                        Unsafe.WriteUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset + Vector<ushort>.Count, utf16HighVector);
-
-                        currentOffset += SizeOfVector;
-                    } while (currentOffset <= finalOffsetWhereCanLoop);
-                }
-            }
-
-            Debug.Assert(currentOffset <= elementCount);
-            nuint remainingElementCount = elementCount - currentOffset;
-
-            // Try to widen 32 bits -> 64 bits at a time.
-            // We needn't update remainingElementCount after this point.
-
-            uint asciiData;
-
-            if (remainingElementCount >= 4)
-            {
-                nuint finalOffsetWhereCanLoop = currentOffset + remainingElementCount - 4;
-                do
-                {
-                    asciiData = Unsafe.ReadUnaligned<uint>(pAsciiBuffer + currentOffset);
-                    if (!ASCIIUtility.AllBytesInUInt32AreAscii(asciiData))
-                    {
-                        goto FoundNonAsciiData;
-                    }
-
-                    WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pUtf16Buffer[currentOffset], asciiData);
-                    currentOffset += 4;
-                } while (currentOffset <= finalOffsetWhereCanLoop);
-            }
-
-            // Try to widen 16 bits -> 32 bits.
-
-            if (((uint)remainingElementCount & 2) != 0)
-            {
-                asciiData = Unsafe.ReadUnaligned<ushort>(pAsciiBuffer + currentOffset);
-                if (!ASCIIUtility.AllBytesInUInt32AreAscii(asciiData))
-                {
-                    goto FoundNonAsciiData;
-                }
-
-                if (BitConverter.IsLittleEndian)
-                {
-                    pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
-                    pUtf16Buffer[currentOffset + 1] = (char)(asciiData >> 8);
-                }
-                else
-                {
-                    pUtf16Buffer[currentOffset + 1] = (char)(byte)asciiData;
-                    pUtf16Buffer[currentOffset] = (char)(asciiData >> 8);
-                }
-
-                currentOffset += 2;
-            }
-
-            // Try to widen 8 bits -> 16 bits.
-
-            if (((uint)remainingElementCount & 1) != 0)
-            {
-                asciiData = pAsciiBuffer[currentOffset];
-                if (((byte)asciiData & 0x80) != 0)
-                {
-                    goto Finish;
-                }
-
-                pUtf16Buffer[currentOffset] = (char)asciiData;
-                currentOffset += 1;
-            }
-
-        Finish:
-
-            return currentOffset;
-
-        FoundNonAsciiData:
-
-            Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(asciiData), "Shouldn't have reached this point if we have an all-ASCII input.");
-
-            // Drain ASCII bytes one at a time.
-
-            while (0u >= (uint)((byte)asciiData & 0x80))
-            {
-                pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
-                currentOffset += 1;
-                asciiData >>= 8;
-            }
-
-            goto Finish;
-        }
-
-        private static unsafe nuint WidenAsciiToUtf16_Sse2(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
-        {
-            // JIT turns the below into constants
-
-            uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
-            nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);
-
-            // This method is written such that control generally flows top-to-bottom, avoiding
-            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
-            // data, we jump out of the hot paths to targets at the end of the method.
-
-            Debug.Assert(Sse2.IsSupported);
-            Debug.Assert(BitConverter.IsLittleEndian);
-            Debug.Assert(elementCount >= 2 * SizeOfVector128);
-
-            // We're going to get the best performance when we have aligned writes, so we'll take the
-            // hit of potentially unaligned reads in order to hit this sweet spot.
-
-            Vector128<byte> asciiVector;
-            Vector128<byte> utf16FirstHalfVector;
-            uint mask;
-
-            // First, perform an unaligned read of the first part of the input buffer.
-
-            asciiVector = Sse2.LoadVector128(pAsciiBuffer); // unaligned load
-            mask = (uint)Sse2.MoveMask(asciiVector);
-
-            // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do.
-
-            if ((byte)mask != 0)
-            {
-                return 0;
-            }
-
-            // Then perform an unaligned write of the first part of the input buffer.
-
-            Vector128<byte> zeroVector = Vector128<byte>.Zero;
-
-            utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector);
-            Sse2.Store((byte*)pUtf16Buffer, utf16FirstHalfVector); // unaligned
-
-            // Calculate how many elements we wrote in order to get pOutputBuffer to its next alignment
-            // point, then use that as the base offset going forward. Remember the >> 1 to account for
-            // that we wrote chars, not bytes. This means we may re-read data in the next iteration of
-            // the loop, but this is ok.
-
-            nuint currentOffset = (SizeOfVector128 >> 1) - (((nuint)pUtf16Buffer >> 1) & (MaskOfAllBitsInVector128 >> 1));
-            Debug.Assert(0 < currentOffset && currentOffset <= SizeOfVector128 / sizeof(char));
-
-            nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128;
-
-            do
-            {
-                // In a loop, perform an unaligned read, widen to two vectors, then aligned write the two vectors.
-
-                asciiVector = Sse2.LoadVector128(pAsciiBuffer + currentOffset); // unaligned load
-                mask = (uint)Sse2.MoveMask(asciiVector);
-
-                if (mask != 0)
-                {
-                    // non-ASCII byte somewhere
-                    goto NonAsciiDataSeenInInnerLoop;
-                }
-
-                byte* pStore = (byte*)(pUtf16Buffer + currentOffset);
-                Sse2.StoreAligned(pStore, Sse2.UnpackLow(asciiVector, zeroVector));
-
-                pStore += SizeOfVector128;
-                Sse2.StoreAligned(pStore, Sse2.UnpackHigh(asciiVector, zeroVector));
-
-                currentOffset += SizeOfVector128;
-            } while (currentOffset <= finalOffsetWhereCanRunLoop);
-
-        Finish:
-
-            return currentOffset;
-
-        NonAsciiDataSeenInInnerLoop:
-
-            // Can we at least widen the first part of the vector?
-
-            if (0u >= ((byte)mask))
-            {
-                // First part was all ASCII, widen
-                utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector);
-                Sse2.StoreAligned((byte*)(pUtf16Buffer + currentOffset), utf16FirstHalfVector);
-                currentOffset += SizeOfVector128 / 2;
-            }
-
-            goto Finish;
-        }
-
-        /// <summary>
-        /// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
-        /// writes them to the output buffer with machine endianness.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value)
-        {
-            Debug.Assert(ASCIIUtility.AllBytesInUInt32AreAscii(value));
-
-            if (Bmi2.X64.IsSupported)
-            {
-                // BMI2 will work regardless of the processor's endianness.
-                Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul));
-            }
-            else
-            {
-                if (BitConverter.IsLittleEndian)
-                {
-                    outputBuffer = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 3) = (char)value;
-                }
-                else
-                {
-                    Unsafe.Add(ref outputBuffer, 3) = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
-                    value >>= 8;
-                    outputBuffer = (char)value;
-                }
-            }
-        }
-    }
-}
-#endif
diff --git a/src/DotNetty.Common/Internal/PlatformDependent.cs b/src/DotNetty.Common/Internal/PlatformDependent.cs
index 09de54e88..b98391d6e 100644
--- a/src/DotNetty.Common/Internal/PlatformDependent.cs
+++ b/src/DotNetty.Common/Internal/PlatformDependent.cs
@@ -59,7 +59,7 @@ public static unsafe bool ByteArrayEquals(byte[] bytes1, int startPos1, byte[] b
                 return true;
             }
 
-            return SpanHelpers.SequenceEqual(ref bytes1[startPos1], ref bytes2[startPos2], unchecked((uint)length));
+            return SpanHelpers.SequenceEqual(ref bytes1[startPos1], ref bytes2[startPos2], length);
         }
 
         public static unsafe int ByteArrayEqualsConstantTime(byte[] bytes1, int startPos1, byte[] bytes2, int startPos2, int length)
diff --git a/src/DotNetty.Common/Internal/SpanHelpers.Byte.cs b/src/DotNetty.Common/Internal/SpanHelpers.Byte.cs
index d79bad49a..e16b0198a 100644
--- a/src/DotNetty.Common/Internal/SpanHelpers.Byte.cs
+++ b/src/DotNetty.Common/Internal/SpanHelpers.Byte.cs
@@ -427,22 +427,12 @@ public static unsafe bool Contains(ref byte searchSpace, byte value, int length)
         // Optimized byte-based SequenceEquals. The "length" parameter for this one is declared a nuint rather than int as we also use it for types other than byte
         // where the length can exceed 2Gb once scaled by sizeof(T).
         //[MethodImpl(MethodImplOptions.AggressiveOptimization)]
-        public static unsafe bool SequenceEqual(ref byte first, ref byte second, long length)
+        public static unsafe bool SequenceEqual(ref byte first, ref byte second, nint length)
         {
             if (Unsafe.AreSame(ref first, ref second)) { goto Equal; }
 
             IntPtr offset = (IntPtr)0; // Use IntPtr for arithmetic to avoid unnecessary 64->32->64 truncations
-            IntPtr lengthToExamine;
-            if (PlatformDependent.Is64BitProcess)
-            {
-                ulong nlen = unchecked((ulong)length);
-                lengthToExamine = (IntPtr)(void*)nlen;
-            }
-            else
-            {
-                uint nlen = unchecked((uint)length);
-                lengthToExamine = (IntPtr)(void*)nlen;
-            }
+            IntPtr lengthToExamine = (IntPtr)(void*)((nuint)length);
 
             if (Vector.IsHardwareAccelerated && (byte*)lengthToExamine >= (byte*)Vector<byte>.Count)
             {
@@ -692,7 +682,7 @@ public static int IndexOf(ref byte searchSpace, int searchSpaceLength, ref byte
             {
                 return 0;  // A zero-length sequence is always treated as "found" at the start of the search space.
             }
-            if (1u >= (uValueLength))
+            if (1u >= uValueLength)
             {
                 return IndexOf(ref searchSpace, value, searchSpaceLength);
             }
@@ -809,16 +799,7 @@ public static unsafe int IndexOf(ref byte searchSpace, byte value, int length)
             {
                 if ((int)(byte*)offset < length)
                 {
-                    bool isAlignedToVector128;
-                    if (PlatformDependent.Is64BitProcess)
-                    {
-                        isAlignedToVector128 = (((ulong)Unsafe.AsPointer(ref searchSpace) + (ulong)offset) & (ulong)(Vector256<byte>.Count - 1)) != 0;
-                    }
-                    else
-                    {
-                        isAlignedToVector128 = (((uint)Unsafe.AsPointer(ref searchSpace) + (uint)offset) & (uint)(Vector256<byte>.Count - 1)) != 0;
-                    }
-                    if (isAlignedToVector128)
+                    if ((((nuint)Unsafe.AsPointer(ref searchSpace) + (nuint)(nint)offset) & (nuint)(Vector256<byte>.Count - 1)) != 0)
                     {
                         // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches
                         // with no upper bound e.g. String.strlen.
diff --git a/src/DotNetty.Common/Internal/SpanHelpers.Char.cs b/src/DotNetty.Common/Internal/SpanHelpers.Char.cs
index 689ae8319..87923c682 100644
--- a/src/DotNetty.Common/Internal/SpanHelpers.Char.cs
+++ b/src/DotNetty.Common/Internal/SpanHelpers.Char.cs
@@ -21,9 +21,7 @@ public static unsafe bool Contains(ref char searchSpace, char value, int length)
             Debug.Assert(length >= 0);
 
 #if NETCOREAPP_3_0_GREATER
-            int index = PlatformDependent.Is64BitProcess
-                ? InternalIndexOf_x64(ref searchSpace, value, length)
-                : InternalIndexOf_x32(ref searchSpace, value, length);
+            int index = IndexOf(ref searchSpace, value, length);
             return SharedConstants.TooBigOrNegative >= (uint)index;
 #else
             fixed (char* pChars = &searchSpace)
@@ -300,7 +298,7 @@ public static int IndexOf(ref char searchSpace, int searchSpaceLength, ref char
                 if (SequenceEqual(
                     ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, index + 1)),
                     ref Unsafe.As<char, byte>(ref valueTail),
-                    (long)valueTailLength * 2)) // nuint
+                    (nint)valueTailLength * 2)) // nuint
                 {
                     return index;  // The tail matched. Return a successful find.
                 }
@@ -316,106 +314,8 @@ public static unsafe int IndexOf(ref char searchSpace, char value, int length)
             Debug.Assert(length >= 0);
 
 #if NETCOREAPP_3_0_GREATER
-            return PlatformDependent.Is64BitProcess
-                ? InternalIndexOf_x64(ref searchSpace, value, length)
-                : InternalIndexOf_x32(ref searchSpace, value, length);
-#else
-            fixed (char* pChars = &searchSpace)
-            {
-                char* pCh = pChars;
-                char* pEndCh = pCh + length;
-
-                if (Vector.IsHardwareAccelerated && length >= Vector<ushort>.Count * 2)
-                {
-                    // Figure out how many characters to read sequentially until we are vector aligned
-                    // This is equivalent to:
-                    //         unaligned = ((int)pCh % Unsafe.SizeOf<Vector<ushort>>()) / elementsPerByte
-                    //         length = (Vector<ushort>.Count - unaligned) % Vector<ushort>.Count
-                    const int elementsPerByte = sizeof(ushort) / sizeof(byte);
-                    int unaligned = ((int)pCh & (Unsafe.SizeOf<Vector<ushort>>() - 1)) / elementsPerByte;
-                    length = (Vector<ushort>.Count - unaligned) & (Vector<ushort>.Count - 1);
-                }
-
-            SequentialScan:
-                while (length >= 4)
-                {
-                    length -= 4;
-
-                    if (pCh[0] == value)
-                        goto Found;
-                    if (pCh[1] == value)
-                        goto Found1;
-                    if (pCh[2] == value)
-                        goto Found2;
-                    if (pCh[3] == value)
-                        goto Found3;
-
-                    pCh += 4;
-                }
-
-                while (length > 0)
-                {
-                    length--;
-
-                    if (pCh[0] == value)
-                        goto Found;
-
-                    pCh++;
-                }
-
-                // We get past SequentialScan only if IsHardwareAccelerated is true. However, we still have the redundant check to allow
-                // the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated.
-                if (Vector.IsHardwareAccelerated && pCh < pEndCh)
-                {
-                    // Get the highest multiple of Vector<ushort>.Count that is within the search space.
-                    // That will be how many times we iterate in the loop below.
-                    // This is equivalent to: length = Vector<ushort>.Count * ((int)(pEndCh - pCh) / Vector<ushort>.Count)
-                    length = (int)((pEndCh - pCh) & ~(Vector<ushort>.Count - 1));
-
-                    // Get comparison Vector
-                    Vector<ushort> vComparison = new Vector<ushort>(value);
-
-                    while (length > 0)
-                    {
-                        // Using Unsafe.Read instead of ReadUnaligned since the search space is pinned and pCh is always vector aligned
-                        Debug.Assert(((int)pCh & (Unsafe.SizeOf<Vector<ushort>>() - 1)) == 0);
-                        Vector<ushort> vMatches = Vector.Equals(vComparison, Unsafe.Read<Vector<ushort>>(pCh));
-                        if (Vector<ushort>.Zero.Equals(vMatches))
-                        {
-                            pCh += Vector<ushort>.Count;
-                            length -= Vector<ushort>.Count;
-                            continue;
-                        }
-                        // Find offset of first match
-                        return (int)(pCh - pChars) + LocateFirstFoundChar(vMatches);
-                    }
-
-                    if (pCh < pEndCh)
-                    {
-                        length = (int)(pEndCh - pCh);
-                        goto SequentialScan;
-                    }
-                }
-
-                return -1;
-            Found3:
-                pCh++;
-            Found2:
-                pCh++;
-            Found1:
-                pCh++;
-            Found:
-                return (int)(pCh - pChars);
-            }
-#endif
-        }
-
-#if NETCOREAPP_3_0_GREATER
-        [MethodImpl(MethodImplOptions.AggressiveOptimization)]
-        private static unsafe int InternalIndexOf_x64(ref char searchSpace, char value, int length)
-        {
-            long offset = 0L;
-            long lengthToExamine = length;
+            nint offset = 0;
+            nint lengthToExamine = length;
 
             if (((int)Unsafe.AsPointer(ref searchSpace) & 1) != 0)
             {
@@ -427,7 +327,7 @@ private static unsafe int InternalIndexOf_x64(ref char searchSpace, char value,
                 // Needs to be double length to allow us to align the data first.
                 if (length >= Vector128<ushort>.Count * 2)
                 {
-                    lengthToExamine = UnalignedCountVector128_x64(ref searchSpace);
+                    lengthToExamine = UnalignedCountVector128(ref searchSpace);
                 }
             }
             else if (Vector.IsHardwareAccelerated)
@@ -435,7 +335,7 @@ private static unsafe int InternalIndexOf_x64(ref char searchSpace, char value,
                 // Needs to be double length to allow us to align the data first.
                 if (length >= Vector<ushort>.Count * 2)
                 {
-                    lengthToExamine = UnalignedCountVector_x64(ref searchSpace);
+                    lengthToExamine = UnalignedCountVector(ref searchSpace);
                 }
             }
 
@@ -477,7 +377,7 @@ private static unsafe int InternalIndexOf_x64(ref char searchSpace, char value,
                 if (offset < length)
                 {
                     Debug.Assert(length - offset >= Vector128<ushort>.Count);
-                    if (((long)Unsafe.AsPointer(ref Unsafe.Add(ref searchSpace, (IntPtr)offset)) & (long)(Vector256<byte>.Count - 1)) != 0)
+                    if (((nint)Unsafe.AsPointer(ref Unsafe.Add(ref searchSpace, (IntPtr)offset)) & (nint)(Vector256<byte>.Count - 1)) != 0)
                     {
                         // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches
                         // with no upper bound e.g. String.wcslen. Start with a check on Vector128 to align to Vector256, 
@@ -648,247 +548,96 @@ private static unsafe int InternalIndexOf_x64(ref char searchSpace, char value,
             return (int)(offset + 1);
         Found:
             return (int)(offset);
-        }
-
-        [MethodImpl(MethodImplOptions.NoInlining)]
-        private static unsafe int InternalIndexOf_x32(ref char searchSpace, char value, int length)
-        {
-            int offset = 0;
-            int lengthToExamine = length;
-
-            if (((int)Unsafe.AsPointer(ref searchSpace) & 1) != 0)
-            {
-                // Input isn't char aligned, we won't be able to align it to a Vector
-            }
-            else if (Sse2.IsSupported)
-            {
-                // Avx2 branch also operates on Sse2 sizes, so check is combined.
-                // Needs to be double length to allow us to align the data first.
-                if (length >= Vector128<ushort>.Count * 2)
-                {
-                    lengthToExamine = UnalignedCountVector128_x32(ref searchSpace);
-                }
-            }
-            else if (Vector.IsHardwareAccelerated)
+#else
+            fixed (char* pChars = &searchSpace)
             {
-                // Needs to be double length to allow us to align the data first.
-                if (length >= Vector<ushort>.Count * 2)
+                char* pCh = pChars;
+                char* pEndCh = pCh + length;
+
+                if (Vector.IsHardwareAccelerated && length >= Vector<ushort>.Count * 2)
                 {
-                    lengthToExamine = UnalignedCountVector_x32(ref searchSpace);
+                    // Figure out how many characters to read sequentially until we are vector aligned
+                    // This is equivalent to:
+                    //         unaligned = ((int)pCh % Unsafe.SizeOf<Vector<ushort>>()) / elementsPerByte
+                    //         length = (Vector<ushort>.Count - unaligned) % Vector<ushort>.Count
+                    const int elementsPerByte = sizeof(ushort) / sizeof(byte);
+                    int unaligned = ((int)pCh & (Unsafe.SizeOf<Vector<ushort>>() - 1)) / elementsPerByte;
+                    length = (Vector<ushort>.Count - unaligned) & (Vector<ushort>.Count - 1);
                 }
-            }
 
-        SequentialScan:
-            // In the non-vector case lengthToExamine is the total length.
-            // In the vector case lengthToExamine first aligns to Vector,
-            // then in a second pass after the Vector lengths is the 
-            // remaining data that is shorter than a Vector length.
-            while (lengthToExamine >= 4)
-            {
-                ref char current = ref Add(ref searchSpace, offset);
-
-                if (value == current)
-                    goto Found;
-                if (value == Add(ref current, 1))
-                    goto Found1;
-                if (value == Add(ref current, 2))
-                    goto Found2;
-                if (value == Add(ref current, 3))
-                    goto Found3;
-
-                offset += 4;
-                lengthToExamine -= 4;
-            }
+            SequentialScan:
+                while (length >= 4)
+                {
+                    length -= 4;
 
-            while (lengthToExamine > 0)
-            {
-                if (value == Add(ref searchSpace, offset))
-                    goto Found;
+                    if (pCh[0] == value)
+                        goto Found;
+                    if (pCh[1] == value)
+                        goto Found1;
+                    if (pCh[2] == value)
+                        goto Found2;
+                    if (pCh[3] == value)
+                        goto Found3;
 
-                offset += 1;
-                lengthToExamine -= 1;
-            }
+                    pCh += 4;
+                }
 
-            // We get past SequentialScan only if IsHardwareAccelerated or intrinsic .IsSupported is true. However, we still have the redundant check to allow
-            // the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated.
-            if (Avx2.IsSupported)
-            {
-                if (offset < length)
+                while (length > 0)
                 {
-                    Debug.Assert(length - offset >= Vector128<ushort>.Count);
-                    if (((int)Unsafe.AsPointer(ref Unsafe.Add(ref searchSpace, (IntPtr)offset)) & (int)(Vector256<byte>.Count - 1)) != 0)
-                    {
-                        // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches
-                        // with no upper bound e.g. String.wcslen. Start with a check on Vector128 to align to Vector256, 
-                        // before moving to processing Vector256.
-
-                        // If the input searchSpan has been fixed or pinned, this ensures we do not fault across memory pages 
-                        // while searching for an end of string. Specifically that this assumes that the length is either correct 
-                        // or that the data is pinned otherwise it may cause an AccessViolation from crossing a page boundary into an 
-                        // unowned page. If the search is unbounded (e.g. null terminator in wcslen) and the search value is not found,
-                        // again this will likely cause an AccessViolation. However, correctly bounded searches will return -1 rather 
-                        // than ever causing an AV.
-
-                        // If the searchSpan has not been fixed or pinned the GC can relocate it during the execution of this 
-                        // method, so the alignment only acts as best endeavour. The GC cost is likely to dominate over
-                        // the misalignment that may occur after; to we default to giving the GC a free hand to relocate and 
-                        // its up to the caller whether they are operating over fixed data.
-                        Vector128<ushort> values = Vector128.Create((ushort)value);
-                        Vector128<ushort> search = LoadVector128(ref searchSpace, offset);
+                    length--;
 
-                        // Same method as below
-                        int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte());
-                        if (0u >= (uint)matches)
-                        {
-                            // Zero flags set so no matches
-                            offset += Vector128<ushort>.Count;
-                        }
-                        else
-                        {
-                            // Find bitflag offset of first match and add to current offset
-                            return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char)));
-                        }
-                    }
+                    if (pCh[0] == value)
+                        goto Found;
 
-                    lengthToExamine = GetCharVector256SpanLength(offset, length);
-                    if (lengthToExamine > 0)
-                    {
-                        Vector256<ushort> values = Vector256.Create((ushort)value);
-                        do
-                        {
-                            Debug.Assert(lengthToExamine >= Vector256<ushort>.Count);
+                    pCh++;
+                }
 
-                            Vector256<ushort> search = LoadVector256(ref searchSpace, offset);
-                            int matches = Avx2.MoveMask(Avx2.CompareEqual(values, search).AsByte());
-                            // Note that MoveMask has converted the equal vector elements into a set of bit flags,
-                            // So the bit position in 'matches' corresponds to the element offset.
-                            if (0u >= (uint)matches)
-                            {
-                                // Zero flags set so no matches
-                                offset += Vector256<ushort>.Count;
-                                lengthToExamine -= Vector256<ushort>.Count;
-                                continue;
-                            }
+                // We get past SequentialScan only if IsHardwareAccelerated is true. However, we still have the redundant check to allow
+                // the JIT to see that the code is unreachable and eliminate it when the platform is not hardware accelerated.
+                if (Vector.IsHardwareAccelerated && pCh < pEndCh)
+                {
+                    // Get the highest multiple of Vector<ushort>.Count that is within the search space.
+                    // That will be how many times we iterate in the loop below.
+                    // This is equivalent to: length = Vector<ushort>.Count * ((int)(pEndCh - pCh) / Vector<ushort>.Count)
+                    length = (int)((pEndCh - pCh) & ~(Vector<ushort>.Count - 1));
 
-                            // Find bitflag offset of first match and add to current offset, 
-                            // flags are in bytes so divide for chars
-                            return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char)));
-                        } while (lengthToExamine > 0);
-                    }
+                    // Get comparison Vector
+                    Vector<ushort> vComparison = new Vector<ushort>(value);
 
-                    lengthToExamine = GetCharVector128SpanLength(offset, length);
-                    if (lengthToExamine > 0)
+                    while (length > 0)
                     {
-                        Debug.Assert(lengthToExamine >= Vector128<ushort>.Count);
-
-                        Vector128<ushort> values = Vector128.Create((ushort)value);
-                        Vector128<ushort> search = LoadVector128(ref searchSpace, offset);
-
-                        // Same method as above
-                        int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte());
-                        if (0u >= (uint)matches)
-                        {
-                            // Zero flags set so no matches
-                            offset += Vector128<ushort>.Count;
-                            // Don't need to change lengthToExamine here as we don't use its current value again.
-                        }
-                        else
+                        // Using Unsafe.Read instead of ReadUnaligned since the search space is pinned and pCh is always vector aligned
+                        Debug.Assert(((int)pCh & (Unsafe.SizeOf<Vector<ushort>>() - 1)) == 0);
+                        Vector<ushort> vMatches = Vector.Equals(vComparison, Unsafe.Read<Vector<ushort>>(pCh));
+                        if (Vector<ushort>.Zero.Equals(vMatches))
                         {
-                            // Find bitflag offset of first match and add to current offset, 
-                            // flags are in bytes so divide for chars
-                            return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char)));
+                            pCh += Vector<ushort>.Count;
+                            length -= Vector<ushort>.Count;
+                            continue;
                         }
+                        // Find offset of first match
+                        return (int)(pCh - pChars) + LocateFirstFoundChar(vMatches);
                     }
 
-                    if (offset < length)
-                    {
-                        lengthToExamine = length - offset;
-                        goto SequentialScan;
-                    }
-                }
-            }
-            else if (Sse2.IsSupported)
-            {
-                if (offset < length)
-                {
-                    Debug.Assert(length - offset >= Vector128<ushort>.Count);
-
-                    lengthToExamine = GetCharVector128SpanLength(offset, length);
-                    if (lengthToExamine > 0)
-                    {
-                        Vector128<ushort> values = Vector128.Create((ushort)value);
-                        do
-                        {
-                            Debug.Assert(lengthToExamine >= Vector128<ushort>.Count);
-
-                            Vector128<ushort> search = LoadVector128(ref searchSpace, offset);
-
-                            // Same method as above
-                            int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte());
-                            if (0u >= (uint)matches)
-                            {
-                                // Zero flags set so no matches
-                                offset += Vector128<ushort>.Count;
-                                lengthToExamine -= Vector128<ushort>.Count;
-                                continue;
-                            }
-
-                            // Find bitflag offset of first match and add to current offset, 
-                            // flags are in bytes so divide for chars
-                            return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char)));
-                        } while (lengthToExamine > 0);
-                    }
-
-                    if (offset < length)
+                    if (pCh < pEndCh)
                     {
-                        lengthToExamine = length - offset;
+                        length = (int)(pEndCh - pCh);
                         goto SequentialScan;
                     }
                 }
-            }
-            else if (Vector.IsHardwareAccelerated && offset < length)
-            {
-                Debug.Assert(length - offset >= Vector<ushort>.Count);
-
-                lengthToExamine = GetCharVectorSpanLength(offset, length);
-
-                if (lengthToExamine > 0)
-                {
-                    Vector<ushort> values = new Vector<ushort>((ushort)value);
-                    do
-                    {
-                        Debug.Assert(lengthToExamine >= Vector<ushort>.Count);
-
-                        var matches = Vector.Equals(values, LoadVector(ref searchSpace, offset));
-                        if (Vector<ushort>.Zero.Equals(matches))
-                        {
-                            offset += Vector<ushort>.Count;
-                            lengthToExamine -= Vector<ushort>.Count;
-                            continue;
-                        }
-
-                        // Find offset of first match
-                        return (int)(offset + LocateFirstFoundChar(matches));
-                    } while (lengthToExamine > 0);
-                }
 
-                if (offset < length)
-                {
-                    lengthToExamine = length - offset;
-                    goto SequentialScan;
-                }
+                return -1;
+            Found3:
+                pCh++;
+            Found2:
+                pCh++;
+            Found1:
+                pCh++;
+            Found:
+                return (int)(pCh - pChars);
             }
-            return -1;
-        Found3:
-            return (int)(offset + 3);
-        Found2:
-            return (int)(offset + 2);
-        Found1:
-            return (int)(offset + 1);
-        Found:
-            return (int)(offset);
-        }
 #endif
+        }
 
         #endregion
 
@@ -1769,63 +1518,39 @@ private static int LocateLastFoundChar(ulong match)
 
 #if NETCOREAPP_3_0_GREATER
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static ref char Add(ref char source, int elementOffset)
-            => ref Unsafe.Add(ref source, (IntPtr)elementOffset);
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static ref char Add(ref char source, long elementOffset)
+        public static ref char Add(ref char source, nint elementOffset)
             => ref Unsafe.Add(ref source, (IntPtr)elementOffset);
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector<ushort> LoadVector(ref char start, int offset)
-            => Unsafe.ReadUnaligned<Vector<ushort>>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref start, (IntPtr)offset)));
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector<ushort> LoadVector(ref char start, long offset)
+        private static unsafe Vector<ushort> LoadVector(ref char start, nint offset)
             => Unsafe.ReadUnaligned<Vector<ushort>>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref start, (IntPtr)offset)));
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector128<ushort> LoadVector128(ref char start, int offset)
-            => Unsafe.ReadUnaligned<Vector128<ushort>>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref start, (IntPtr)offset)));
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector128<ushort> LoadVector128(ref char start, long offset)
+        private static unsafe Vector128<ushort> LoadVector128(ref char start, nint offset)
             => Unsafe.ReadUnaligned<Vector128<ushort>>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref start, (IntPtr)offset)));
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector256<ushort> LoadVector256(ref char start, int offset)
-            => Unsafe.ReadUnaligned<Vector256<ushort>>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref start, (IntPtr)offset)));
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector256<ushort> LoadVector256(ref char start, long offset)
+        private static unsafe Vector256<ushort> LoadVector256(ref char start, nint offset)
             => Unsafe.ReadUnaligned<Vector256<ushort>>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref start, (IntPtr)offset)));
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe UIntPtr LoadUIntPtr(ref char start, int offset)
-            => Unsafe.ReadUnaligned<UIntPtr>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref start, (IntPtr)offset)));
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe UIntPtr LoadUIntPtr(ref char start, long offset)
+        private static unsafe UIntPtr LoadUIntPtr(ref char start, nint offset)
             => Unsafe.ReadUnaligned<UIntPtr>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref start, (IntPtr)offset)));
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe int GetCharVectorSpanLength(int offset, int length)
-            => ((length - offset) & ~(Vector<ushort>.Count - 1));
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe long GetCharVectorSpanLength(long offset, long length)
+        private static unsafe nint GetCharVectorSpanLength(nint offset, nint length)
             => ((length - offset) & ~(Vector<ushort>.Count - 1));
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe int GetCharVector128SpanLength(int offset, int length)
-            => ((length - offset) & ~(Vector128<ushort>.Count - 1));
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe long GetCharVector128SpanLength(long offset, long length)
+        private static unsafe nint GetCharVector128SpanLength(nint offset, nint length)
             => ((length - offset) & ~(Vector128<ushort>.Count - 1));
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static int GetCharVector256SpanLength(int offset, int length)
-            => ((length - offset) & ~(Vector256<ushort>.Count - 1));
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static long GetCharVector256SpanLength(long offset, long length)
+        private static nint GetCharVector256SpanLength(nint offset, nint length)
             => ((length - offset) & ~(Vector256<ushort>.Count - 1));
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe int UnalignedCountVector_x32(ref char searchSpace)
+        private static unsafe nint UnalignedCountVector(ref char searchSpace)
         {
             const int ElementsPerByte = sizeof(ushort) / sizeof(byte);
             // Figure out how many characters to read sequentially until we are vector aligned
@@ -1836,40 +1561,17 @@ private static unsafe int UnalignedCountVector_x32(ref char searchSpace)
             // This alignment is only valid if the GC does not relocate; so we use ReadUnaligned to get the data.
             // If a GC does occur and alignment is lost, the GC cost will outweigh any gains from alignment so it
             // isn't too important to pin to maintain the alignment.
-            return (int)(uint)(-(int)Unsafe.AsPointer(ref searchSpace) / ElementsPerByte) & (Vector<ushort>.Count - 1);
+            return (nint)(uint)(-(int)Unsafe.AsPointer(ref searchSpace) / ElementsPerByte) & (Vector<ushort>.Count - 1);
         }
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe long UnalignedCountVector_x64(ref char searchSpace)
-        {
-            const int ElementsPerByte = sizeof(ushort) / sizeof(byte);
-            // Figure out how many characters to read sequentially until we are vector aligned
-            // This is equivalent to:
-            //         unaligned = ((int)pCh % Unsafe.SizeOf<Vector<ushort>>()) / ElementsPerByte 
-            //         length = (Vector<ushort>.Count - unaligned) % Vector<ushort>.Count
 
-            // This alignment is only valid if the GC does not relocate; so we use ReadUnaligned to get the data.
-            // If a GC does occur and alignment is lost, the GC cost will outweigh any gains from alignment so it
-            // isn't too important to pin to maintain the alignment.
-            return (long)(uint)(-(int)Unsafe.AsPointer(ref searchSpace) / ElementsPerByte) & (Vector<ushort>.Count - 1);
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe int UnalignedCountVector128_x32(ref char searchSpace)
-        {
-            const int ElementsPerByte = sizeof(ushort) / sizeof(byte);
-            // This alignment is only valid if the GC does not relocate; so we use ReadUnaligned to get the data.
-            // If a GC does occur and alignment is lost, the GC cost will outweigh any gains from alignment so it
-            // isn't too important to pin to maintain the alignment.
-            return (int)(uint)(-(int)Unsafe.AsPointer(ref searchSpace) / ElementsPerByte) & (Vector128<ushort>.Count - 1);
-        }
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe long UnalignedCountVector128_x64(ref char searchSpace)
+        private static unsafe nint UnalignedCountVector128(ref char searchSpace)
         {
             const int ElementsPerByte = sizeof(ushort) / sizeof(byte);
             // This alignment is only valid if the GC does not relocate; so we use ReadUnaligned to get the data.
             // If a GC does occur and alignment is lost, the GC cost will outweigh any gains from alignment so it
             // isn't too important to pin to maintain the alignment.
-            return (long)(uint)(-(int)Unsafe.AsPointer(ref searchSpace) / ElementsPerByte) & (Vector128<ushort>.Count - 1);
+            return (nint)(uint)(-(int)Unsafe.AsPointer(ref searchSpace) / ElementsPerByte) & (Vector128<ushort>.Count - 1);
         }
 #endif
 
diff --git a/src/DotNetty.Common/Internal/TextEncodings.Utf16.NetCore3.cs b/src/DotNetty.Common/Internal/TextEncodings.Utf16.NetCore3.cs
index 6f115b9e0..25bf98fc1 100644
--- a/src/DotNetty.Common/Internal/TextEncodings.Utf16.NetCore3.cs
+++ b/src/DotNetty.Common/Internal/TextEncodings.Utf16.NetCore3.cs
@@ -65,14 +65,7 @@ private static unsafe int GetBytesFastInternal(char* pChars, int charsLength, by
                 char* pInputBufferRemaining;
                 byte* pOutputBufferRemaining;
 
-                if (PlatformDependent.Is64BitProcess)
-                {
-                    _ = Utf8Utility64.TranscodeToUtf8(pChars, charsLength, pBytes, bytesLength, out pInputBufferRemaining, out pOutputBufferRemaining);
-                }
-                else
-                {
-                    _ = Utf8Utility32.TranscodeToUtf8(pChars, charsLength, pBytes, bytesLength, out pInputBufferRemaining, out pOutputBufferRemaining);
-                }
+                _ = Utf8Utility.TranscodeToUtf8(pChars, charsLength, pBytes, bytesLength, out pInputBufferRemaining, out pOutputBufferRemaining);
 
                 charsConsumed = (int)(pInputBufferRemaining - pChars);
                 return (int)(pOutputBufferRemaining - pBytes);
diff --git a/src/DotNetty.Common/Internal/TextEncodings.Utf8.NetCore3.cs b/src/DotNetty.Common/Internal/TextEncodings.Utf8.NetCore3.cs
index 9d3bb06aa..c0cda71ca 100644
--- a/src/DotNetty.Common/Internal/TextEncodings.Utf8.NetCore3.cs
+++ b/src/DotNetty.Common/Internal/TextEncodings.Utf8.NetCore3.cs
@@ -27,9 +27,7 @@ private static unsafe int GetCharCountFastInternal(byte* pBytes, int bytesLength
                 // The number of UTF-16 code units will never exceed the number of UTF-8 code units,
                 // so the addition at the end of this method will not overflow.
 
-                byte* ptrToFirstInvalidByte = PlatformDependent.Is64BitProcess
-                    ? Utf8Utility64.GetPointerToFirstInvalidByte(pBytes, bytesLength, out int utf16CodeUnitCountAdjustment, out _)
-                    : Utf8Utility32.GetPointerToFirstInvalidByte(pBytes, bytesLength, out utf16CodeUnitCountAdjustment, out _);
+                byte* ptrToFirstInvalidByte = Utf8Utility.GetPointerToFirstInvalidByte(pBytes, bytesLength, out int utf16CodeUnitCountAdjustment, out _);
 
                 int tempBytesConsumed = (int)(ptrToFirstInvalidByte - pBytes);
                 bytesConsumed = tempBytesConsumed;
@@ -69,14 +67,7 @@ private static unsafe int GetCharsFastInternal(byte* pBytes, int bytesLength, ch
                 byte* pInputBufferRemaining;
                 char* pOutputBufferRemaining;
 
-                if (PlatformDependent.Is64BitProcess)
-                {
-                    _ = Utf8Utility64.TranscodeToUtf16(pBytes, bytesLength, pChars, charsLength, out pInputBufferRemaining, out pOutputBufferRemaining);
-                }
-                else
-                {
-                    _ = Utf8Utility32.TranscodeToUtf16(pBytes, bytesLength, pChars, charsLength, out pInputBufferRemaining, out pOutputBufferRemaining);
-                }
+                _ = Utf8Utility.TranscodeToUtf16(pBytes, bytesLength, pChars, charsLength, out pInputBufferRemaining, out pOutputBufferRemaining);
 
                 bytesConsumed = (int)(pInputBufferRemaining - pBytes);
                 return (int)(pOutputBufferRemaining - pChars);
@@ -108,9 +99,7 @@ private static unsafe int GetByteCountFastInternal(char* pChars, int charsLength
                 // The number of UTF-8 code units may exceed the number of UTF-16 code units,
                 // so we'll need to check for overflow before casting to Int32.
 
-                char* ptrToFirstInvalidChar = PlatformDependent.Is64BitProcess
-                    ? Utf16Utility64.GetPointerToFirstInvalidChar(pChars, charsLength, out long utf8CodeUnitCountAdjustment, out _)
-                    : Utf16Utility32.GetPointerToFirstInvalidChar(pChars, charsLength, out utf8CodeUnitCountAdjustment, out _);
+                char* ptrToFirstInvalidChar = Utf16Utility.GetPointerToFirstInvalidChar(pChars, charsLength, out long utf8CodeUnitCountAdjustment, out _);
 
                 int tempCharsConsumed = (int)(ptrToFirstInvalidChar - pChars);
                 charsConsumed = tempCharsConsumed;
diff --git a/src/DotNetty.Common/Internal/Utf16Utility32.Validation.cs b/src/DotNetty.Common/Internal/Utf16Utility.Validation.cs
similarity index 94%
rename from src/DotNetty.Common/Internal/Utf16Utility32.Validation.cs
rename to src/DotNetty.Common/Internal/Utf16Utility.Validation.cs
index 572dec4b5..c4f438c30 100644
--- a/src/DotNetty.Common/Internal/Utf16Utility32.Validation.cs
+++ b/src/DotNetty.Common/Internal/Utf16Utility.Validation.cs
@@ -11,21 +11,13 @@
 using System.Runtime.Intrinsics.X86;
 using System.Numerics;
 using System.Runtime.CompilerServices;
-using nint = System.Int32;
-using nuint = System.UInt32;
+using nuint_64 = System.UInt64;
+using nuint_32 = System.UInt32;
 
 namespace DotNetty.Common.Internal
 {
-    internal static unsafe class Utf16Utility32
+    internal static unsafe partial class Utf16Utility
     {
-#if DEBUG
-        static Utf16Utility32()
-        {
-            Debug.Assert(sizeof(nint) == IntPtr.Size && nint.MinValue < 0, "nint is defined incorrectly.");
-            Debug.Assert(sizeof(nuint) == IntPtr.Size && nuint.MinValue == 0, "nuint is defined incorrectly.");
-        }
-#endif // DEBUG
-
         // Returns &inputBuffer[inputLength] if the input buffer is valid.
         /// <summary>
         /// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
@@ -42,7 +34,7 @@ static Utf16Utility32()
             // First, we'll handle the common case of all-ASCII. If this is able to
             // consume the entire buffer, we'll skip the remainder of this method's logic.
 
-            int numAsciiCharsConsumedJustNow = (int)ASCIIUtility32.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);
+            int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);
             Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength);
 
             pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
@@ -280,15 +272,30 @@ static Utf16Utility32()
                         Vector<ushort> utf16Data = Unsafe.ReadUnaligned<Vector<ushort>>(pInputBuffer);
                         Vector<ushort> twoOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0080);
                         Vector<ushort> threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
-                        Vector<nuint> sumVector = (Vector<nuint>)(Vector<ushort>.Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);
+                        nuint popcnt = 0;
+                        if (PlatformDependent.Is64BitProcess)
+                        {
+                            Vector<nuint_64> sumVector = (Vector<nuint_64>)(Vector<ushort>.Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);
 
-                        // We'll try summing by a natural word (rather than a 16-bit word) at a time,
-                        // which should halve the number of operations we must perform.
+                            // We'll try summing by a 64-bit word (rather than a 16-bit word) at a time,
+                            // which should cut the number of additions we must perform to a quarter.
 
-                        nuint popcnt = 0;
-                        for (int i = 0; i < Vector<nuint>.Count; i++)
+                            for (int i = 0; i < Vector<nuint_64>.Count; i++)
+                            {
+                                popcnt += (nuint)sumVector[i];
+                            }
+                        }
+                        else
                         {
-                            popcnt += sumVector[i];
+                            Vector<nuint_32> sumVector = (Vector<nuint_32>)(Vector<ushort>.Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);
+
+                            // We'll try summing by a natural word (rather than a 16-bit word) at a time,
+                            // which should halve the number of operations we must perform.
+
+                            for (int i = 0; i < Vector<nuint_32>.Count; i++)
+                            {
+                                popcnt += (nuint)sumVector[i];
+                            }
                         }
 
                         uint popcnt32 = (uint)popcnt;
diff --git a/src/DotNetty.Common/Internal/Utf16Utility.cs b/src/DotNetty.Common/Internal/Utf16Utility.cs
index ef5341bbd..7c7d93e97 100644
--- a/src/DotNetty.Common/Internal/Utf16Utility.cs
+++ b/src/DotNetty.Common/Internal/Utf16Utility.cs
@@ -10,7 +10,7 @@
 
 namespace DotNetty.Common.Internal
 {
-    internal static class Utf16Utility
+    internal static partial class Utf16Utility
     {
         /// <summary>
         /// Returns true iff the UInt32 represents two ASCII UTF-16 characters in machine endianness.
diff --git a/src/DotNetty.Common/Internal/Utf16Utility64.Validation.cs b/src/DotNetty.Common/Internal/Utf16Utility64.Validation.cs
deleted file mode 100644
index b75a4d4a8..000000000
--- a/src/DotNetty.Common/Internal/Utf16Utility64.Validation.cs
+++ /dev/null
@@ -1,433 +0,0 @@
-﻿// borrowed from https://github.com/dotnet/corefx/tree/release/3.1/src/Common/src/CoreLib/System/Text/Unicode
-
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
-
-#if NETCOREAPP_3_0_GREATER
-using System;
-using System.Diagnostics;
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-
-using nint = System.Int64;
-using nuint = System.UInt64;
-
-namespace DotNetty.Common.Internal
-{
-    internal static unsafe class Utf16Utility64
-    {
-#if DEBUG
-        static Utf16Utility64()
-        {
-            Debug.Assert(sizeof(nint) == IntPtr.Size && nint.MinValue < 0, "nint is defined incorrectly.");
-            Debug.Assert(sizeof(nuint) == IntPtr.Size && nuint.MinValue == 0, "nuint is defined incorrectly.");
-        }
-#endif // DEBUG
-
-        // Returns &inputBuffer[inputLength] if the input buffer is valid.
-        /// <summary>
-        /// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
-        /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
-        /// </summary>
-        /// <remarks>
-        /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
-        /// </remarks>
-        public static char* GetPointerToFirstInvalidChar(char* pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
-        {
-            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
-            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
-
-            // First, we'll handle the common case of all-ASCII. If this is able to
-            // consume the entire buffer, we'll skip the remainder of this method's logic.
-
-            int numAsciiCharsConsumedJustNow = (int)ASCIIUtility64.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);
-            Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength);
-
-            pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
-            inputLength -= numAsciiCharsConsumedJustNow;
-
-            if (0u >= (uint)inputLength)
-            {
-                utf8CodeUnitCountAdjustment = 0;
-                scalarCountAdjustment = 0;
-                return pInputBuffer;
-            }
-
-            // If we got here, it means we saw some non-ASCII data, so within our
-            // vectorized code paths below we'll handle all non-surrogate UTF-16
-            // code points branchlessly. We'll only branch if we see surrogates.
-            // 
-            // We still optimistically assume the data is mostly ASCII. This means that the
-            // number of UTF-8 code units and the number of scalars almost matches the number
-            // of UTF-16 code units. As we go through the input and find non-ASCII
-            // characters, we'll keep track of these "adjustment" fixups. To get the
-            // total number of UTF-8 code units required to encode the input data, add
-            // the UTF-8 code unit count adjustment to the number of UTF-16 code units
-            // seen.  To get the total number of scalars present in the input data,
-            // add the scalar count adjustment to the number of UTF-16 code units seen.
-
-            long tempUtf8CodeUnitCountAdjustment = 0;
-            int tempScalarCountAdjustment = 0;
-
-            if (Sse2.IsSupported)
-            {
-                if (inputLength >= Vector128<ushort>.Count)
-                {
-                    Vector128<ushort> vector0080 = Vector128.Create((ushort)0x80);
-                    Vector128<ushort> vectorA800 = Vector128.Create((ushort)0xA800);
-                    Vector128<short> vector8800 = Vector128.Create(unchecked((short)0x8800));
-                    Vector128<ushort> vectorZero = Vector128<ushort>.Zero;
-
-                    do
-                    {
-                        Vector128<ushort> utf16Data = Sse2.LoadVector128((ushort*)pInputBuffer); // unaligned
-                        uint mask;
-
-                        // The 'charIsNonAscii' vector we're about to build will have the 0x8000 or the 0x0080
-                        // bit set (but not both!) only if the corresponding input char is non-ASCII. Which of
-                        // the two bits is set doesn't matter, as will be explained in the diagram a few lines
-                        // below.
-
-                        Vector128<ushort> charIsNonAscii;
-                        if (Sse41.IsSupported)
-                        {
-                            // sets 0x0080 bit if corresponding char element is >= 0x0080
-                            charIsNonAscii = Sse41.Min(utf16Data, vector0080);
-                        }
-                        else
-                        {
-                            // sets 0x8000 bit if corresponding char element is >= 0x0080
-                            charIsNonAscii = Sse2.AndNot(vector0080, Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 7)));
-                        }
-
-#if DEBUG
-                        // Quick check to ensure we didn't accidentally set both 0x8080 bits in any element.
-                        uint debugMask = (uint)Sse2.MoveMask(charIsNonAscii.AsByte());
-                        Debug.Assert((debugMask & (debugMask << 1)) == 0, "Two set bits shouldn't occur adjacent to each other in this mask.");
-#endif // DEBUG
-
-                        // sets 0x8080 bits if corresponding char element is >= 0x0800
-                        Vector128<ushort> charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11));
-
-                        mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
-
-                        // Each odd bit of mask will be 1 only if the char was >= 0x0080,
-                        // and each even bit of mask will be 1 only if the char was >= 0x0800.
-                        //
-                        // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
-                        //
-                        //            ,-- set if char[1] is non-ASCII
-                        //            |   ,-- set if char[0] is non-ASCII
-                        //            v   v
-                        // mask = ... 1 1 1 0
-                        //              ^   ^-- set if char[0] is >= 0x0800
-                        //              `-- set if char[1] is >= 0x0800
-                        //
-                        // (If the SSE4.1 code path is taken above, the meaning of the odd and even
-                        // bits are swapped, but the logic below otherwise holds.)
-                        //
-                        // This means we can popcnt the number of set bits, and the result is the
-                        // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
-                        // it expands. This results in the wrong count for UTF-16 surrogate code
-                        // units (we just counted that each individual code unit expands to 3 bytes,
-                        // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
-                        // We'll handle this in just a moment.
-                        //
-                        // For now, compute the popcnt but squirrel it away. We'll fold it in to the
-                        // cumulative UTF-8 adjustment factor once we determine that there are no
-                        // unpaired surrogates in our data. (Unpaired surrogates would invalidate
-                        // our computed result and we'd have to throw it away.)
-
-                        uint popcnt = (uint)BitOperations.PopCount(mask);
-
-                        // Surrogates need to be special-cased for two reasons: (a) we need
-                        // to account for the fact that we over-counted in the addition above;
-                        // and (b) they require separate validation.
-
-                        utf16Data = Sse2.Add(utf16Data, vectorA800);
-                        mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
-
-                        if (mask != 0)
-                        {
-                            // There's at least one UTF-16 surrogate code unit present.
-                            // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
-                            // the resulting bits of 'mask' will occur in pairs:
-                            // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
-                            // - 11 if the corresponding UTF-16 char was a surrogate code unit.
-                            //
-                            // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
-                            // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
-                            // a low surrogate. Since we added 0xA800 in the vectorized operation above,
-                            // our surrogate pairs will now have the bit pattern [ 10000q## ######## ].
-                            // If we logical right-shift each word by 3, we'll end up with the bit pattern
-                            // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
-                            // determine whether a given char was a high or a low surrogate.
-                            //
-                            // Therefore the resulting bits of 'mask2' will occur in pairs:
-                            // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
-                            // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
-                            // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.
-                            //   Since 'mask' already has 00 in these positions (since the corresponding char
-                            //   wasn't a surrogate), "mask AND mask2 == 00" holds for these positions.
-
-                            uint mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());
-
-                            // 'lowSurrogatesMask' has its bits occur in pairs:
-                            // - 01 if the corresponding char was a low surrogate char,
-                            // - 00 if the corresponding char was a high surrogate char or not a surrogate at all.
-
-                            uint lowSurrogatesMask = mask2 & mask;
-
-                            // 'highSurrogatesMask' has its bits occur in pairs:
-                            // - 01 if the corresponding char was a high surrogate char,
-                            // - 00 if the corresponding char was a low surrogate char or not a surrogate at all.
-
-                            uint highSurrogatesMask = (mask2 ^ 0b_0101_0101_0101_0101u /* flip all even-numbered bits 00 <-> 01 */) & mask;
-
-                            Debug.Assert((highSurrogatesMask & lowSurrogatesMask) == 0,
-                                "A char cannot simultaneously be both a high and a low surrogate char.");
-
-                            Debug.Assert(((highSurrogatesMask | lowSurrogatesMask) & 0b_1010_1010_1010_1010u) == 0,
-                                "Only even bits (no odd bits) of the masks should be set.");
-
-                            // Now check that each high surrogate is followed by a low surrogate and that each
-                            // low surrogate follows a high surrogate. We make an exception for the case where
-                            // the final char of the vector is a high surrogate, since we can't perform validation
-                            // on it until the next iteration of the loop when we hope to consume the matching
-                            // low surrogate.
-
-                            highSurrogatesMask <<= 2;
-                            if ((ushort)highSurrogatesMask != lowSurrogatesMask)
-                            {
-                                goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
-                            }
-
-                            if (highSurrogatesMask > ushort.MaxValue)
-                            {
-                                // There was a standalone high surrogate at the end of the vector.
-                                // We'll adjust our counters so that we don't consider this char consumed.
-
-                                highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
-                                popcnt -= 2; // the '0xC000_0000' bits in the original mask are shifted out and discarded, so account for that here
-                                pInputBuffer--;
-                                inputLength++;
-                            }
-
-                            // If we're 64-bit, we can perform the zero-extension of the surrogate pairs count for
-                            // free right now, saving the extension step a few lines below. If we're 32-bit, the
-                            // convertion to nuint immediately below is a no-op, and we'll pay the cost of the real
-                            // 64 -bit extension a few lines below.
-                            nuint surrogatePairsCountNuint = (uint)BitOperations.PopCount(highSurrogatesMask);
-
-                            // 2 UTF-16 chars become 1 Unicode scalar
-
-                            tempScalarCountAdjustment -= (int)surrogatePairsCountNuint;
-
-                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
-                            // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation
-                            // assumes that the pair is encoded as 6 UTF-8 code units. Since each
-                            // pair is in reality only encoded as 4 UTF-8 code units, we need to
-                            // perform this adjustment now.
-
-                            if (PlatformDependent.Is64BitProcess)
-                            {
-                                // Since we've already zero-extended surrogatePairsCountNuint, we can directly
-                                // sub + sub. It's more efficient than shl + sub.
-                                tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
-                                tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
-                            }
-                            else
-                            {
-                                // Take the hit of the 64-bit extension now.
-                                tempUtf8CodeUnitCountAdjustment -= 2 * (uint)surrogatePairsCountNuint;
-                            }
-                        }
-
-                        tempUtf8CodeUnitCountAdjustment += popcnt;
-                        pInputBuffer += Vector128<ushort>.Count;
-                        inputLength -= Vector128<ushort>.Count;
-                    } while (inputLength >= Vector128<ushort>.Count);
-                }
-            }
-            else if (Vector.IsHardwareAccelerated)
-            {
-                if (inputLength >= Vector<ushort>.Count)
-                {
-                    Vector<ushort> vector0080 = new Vector<ushort>(0x0080);
-                    Vector<ushort> vector0400 = new Vector<ushort>(0x0400);
-                    Vector<ushort> vector0800 = new Vector<ushort>(0x0800);
-                    Vector<ushort> vectorD800 = new Vector<ushort>(0xD800);
-
-                    do
-                    {
-                        // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
-                        // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
-                        // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
-                        // vectors, each element of the sum will contain one of three values:
-                        //
-                        // 0x0000 ( 0) = original char was 0000..007F
-                        // 0xFFFF (-1) = original char was 0080..07FF
-                        // 0xFFFE (-2) = original char was 0800..FFFF
-                        //
-                        // We'll negate them to produce a value 0..2 for each element, then sum all the
-                        // elements together to produce the number of *additional* UTF-8 code units
-                        // required to represent this UTF-16 data. This is similar to the popcnt step
-                        // performed by the SSE2 code path. This will overcount surrogates, but we'll
-                        // handle that shortly.
-
-                        Vector<ushort> utf16Data = Unsafe.ReadUnaligned<Vector<ushort>>(pInputBuffer);
-                        Vector<ushort> twoOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0080);
-                        Vector<ushort> threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
-                        Vector<nuint> sumVector = (Vector<nuint>)(Vector<ushort>.Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);
-
-                        // We'll try summing by a natural word (rather than a 16-bit word) at a time,
-                        // which should halve the number of operations we must perform.
-
-                        nuint popcnt = 0;
-                        for (int i = 0; i < Vector<nuint>.Count; i++)
-                        {
-                            popcnt += sumVector[i];
-                        }
-
-                        uint popcnt32 = (uint)popcnt;
-                        if (PlatformDependent.Is64BitProcess)
-                        {
-                            popcnt32 += (uint)(popcnt >> 32);
-                        }
-
-                        // As in the SSE4.1 paths, compute popcnt but don't fold it in until we
-                        // know there aren't any unpaired surrogates in the input data.
-
-                        popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16);
-
-                        // Now check for surrogates.
-
-                        utf16Data -= vectorD800;
-                        Vector<ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
-                        if (surrogateChars != Vector<ushort>.Zero)
-                        {
-                            // There's at least one surrogate (high or low) UTF-16 code unit in
-                            // the vector. We'll build up additional vectors: 'highSurrogateChars'
-                            // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
-                            // UTF-16 code unit was a high or low surrogate, respectively.
-
-                            Vector<ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
-                            Vector<ushort> lowSurrogateChars = Vector.AndNot(surrogateChars, highSurrogateChars);
-
-                            // We want to make sure that each high surrogate code unit is followed by
-                            // a low surrogate code unit and each low surrogate code unit follows a
-                            // high surrogate code unit. Since we don't have an equivalent of pmovmskb
-                            // or palignr available to us, we'll do this as a loop. We won't look at
-                            // the very last high surrogate char element since we don't yet know if
-                            // the next vector read will have a low surrogate char element.
-
-                            if (lowSurrogateChars[0] != 0)
-                            {
-                                goto Error; // error: start of buffer contains standalone low surrogate char
-                            }
-
-                            ushort surrogatePairsCount = 0;
-                            for (int i = 0; i < Vector<ushort>.Count - 1; i++)
-                            {
-                                surrogatePairsCount -= highSurrogateChars[i]; // turns into +1 or +0
-                                if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
-                                {
-                                    goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
-                                }
-                            }
-
-                            if (highSurrogateChars[Vector<ushort>.Count - 1] != 0)
-                            {
-                                // There was a standalone high surrogate at the end of the vector.
-                                // We'll adjust our counters so that we don't consider this char consumed.
-
-                                pInputBuffer--;
-                                inputLength++;
-                                popcnt32 -= 2;
-                            }
-
-                            nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size
-
-                            // 2 UTF-16 chars become 1 Unicode scalar
-
-                            tempScalarCountAdjustment -= (int)surrogatePairsCountNint;
-
-                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
-                            // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
-                            // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
-                            // so we'll adjust this now.
-
-                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
-                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
-                        }
-
-                        tempUtf8CodeUnitCountAdjustment += popcnt32;
-                        pInputBuffer += Vector<ushort>.Count;
-                        inputLength -= Vector<ushort>.Count;
-                    } while (inputLength >= Vector<ushort>.Count);
-                }
-            }
-
-        NonVectorizedLoop:
-
-            // Vectorization isn't supported on our current platform, or the input was too small to benefit
-            // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
-            // drain remaining valid chars before we report failure.
-
-            for (; inputLength > 0; pInputBuffer++, inputLength--)
-            {
-                uint thisChar = pInputBuffer[0];
-                if (thisChar <= 0x7F)
-                {
-                    continue;
-                }
-
-                // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
-                // This optimistically assumes no surrogates, which we'll handle shortly.
-
-                tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;
-
-                if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
-                {
-                    continue;
-                }
-
-                // Found a surrogate char. Back out the adjustment we made above, then
-                // try to consume the entire surrogate pair all at once. We won't bother
-                // trying to interpret the surrogate pair as a scalar value; we'll only
-                // validate that its bit pattern matches what's expected for a surrogate pair.
-
-                tempUtf8CodeUnitCountAdjustment -= 2;
-
-                if (inputLength == 1)
-                {
-                    goto Error; // input buffer too small to read a surrogate pair
-                }
-
-                thisChar = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-                if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
-                {
-                    goto Error; // not a well-formed surrogate pair
-                }
-
-                tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar
-                tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units
-
-                pInputBuffer++; // consumed one extra char
-                inputLength--;
-            }
-
-        Error:
-
-            // Also used for normal return.
-
-            utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
-            scalarCountAdjustment = tempScalarCountAdjustment;
-            return pInputBuffer;
-        }
-    }
-}
-#endif
diff --git a/src/DotNetty.Common/Internal/Utf8Utility64.Transcoding.cs b/src/DotNetty.Common/Internal/Utf8Utility.Transcoding.cs
similarity index 98%
rename from src/DotNetty.Common/Internal/Utf8Utility64.Transcoding.cs
rename to src/DotNetty.Common/Internal/Utf8Utility.Transcoding.cs
index 1cfa94388..65670a61b 100644
--- a/src/DotNetty.Common/Internal/Utf8Utility64.Transcoding.cs
+++ b/src/DotNetty.Common/Internal/Utf8Utility.Transcoding.cs
@@ -12,23 +12,11 @@
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics.X86;
-using nint = System.Int64;
-using nuint = System.UInt64;
 
 namespace DotNetty.Common.Internal
 {
-    internal static unsafe partial class Utf8Utility64
+    internal static unsafe partial class Utf8Utility
     {
-#if DEBUG
-        static Utf8Utility64()
-        {
-            Debug.Assert(sizeof(nint) == IntPtr.Size && nint.MinValue < 0, "nint is defined incorrectly.");
-            Debug.Assert(sizeof(nuint) == IntPtr.Size && nuint.MinValue == 0, "nuint is defined incorrectly.");
-
-            _ValidateAdditionalNIntDefinitions();
-        }
-#endif // DEBUG
-
         // On method return, pInputBufferRemaining and pOutputBufferRemaining will both point to where
         // the next byte would have been consumed from / the next char would have been written to.
         // inputLength in bytes, outputCharsRemaining in chars.
@@ -43,7 +31,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
             // First, try vectorized conversion.
 
             {
-                nuint numElementsConverted = ASCIIUtility64.WidenAsciiToUtf16(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputCharsRemaining));
+                nuint numElementsConverted = ASCIIUtility.WidenAsciiToUtf16(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputCharsRemaining));
 
                 pInputBuffer += numElementsConverted;
                 pOutputBuffer += numElementsConverted;
@@ -871,7 +859,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
             // First, try vectorized conversion.
 
             {
-                nuint numElementsConverted = ASCIIUtility64.NarrowUtf16ToAscii(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputBytesRemaining));
+                nuint numElementsConverted = ASCIIUtility.NarrowUtf16ToAscii(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputBytesRemaining));
 
                 pInputBuffer += numElementsConverted;
                 pOutputBuffer += numElementsConverted;
diff --git a/src/DotNetty.Common/Internal/Utf8Utility64.Validation.cs b/src/DotNetty.Common/Internal/Utf8Utility.Validation.cs
similarity index 98%
rename from src/DotNetty.Common/Internal/Utf8Utility64.Validation.cs
rename to src/DotNetty.Common/Internal/Utf8Utility.Validation.cs
index b6789a205..33b0a4e86 100644
--- a/src/DotNetty.Common/Internal/Utf8Utility64.Validation.cs
+++ b/src/DotNetty.Common/Internal/Utf8Utility.Validation.cs
@@ -10,21 +10,11 @@
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics.X86;
-using nint = System.Int64;
-using nuint = System.UInt64;
 
 namespace DotNetty.Common.Internal
 {
-    internal static unsafe partial class Utf8Utility64
+    internal static unsafe partial class Utf8Utility
     {
-#if DEBUG
-        private static void _ValidateAdditionalNIntDefinitions()
-        {
-            Debug.Assert(sizeof(nint) == IntPtr.Size && nint.MinValue < 0, "nint is defined incorrectly.");
-            Debug.Assert(sizeof(nuint) == IntPtr.Size && nuint.MinValue == 0, "nuint is defined incorrectly.");
-        }
-#endif // DEBUG
-
         // Returns &inputBuffer[inputLength] if the input buffer is valid.
         /// <summary>
         /// Given an input buffer <paramref name="pInputBuffer"/> of byte length <paramref name="inputLength"/>,
@@ -41,7 +31,7 @@ private static void _ValidateAdditionalNIntDefinitions()
             // First, try to drain off as many ASCII bytes as we can from the beginning.
 
             {
-                nuint numAsciiBytesCounted = ASCIIUtility64.GetIndexOfFirstNonAsciiByte(pInputBuffer, (uint)inputLength);
+                nuint numAsciiBytesCounted = ASCIIUtility.GetIndexOfFirstNonAsciiByte(pInputBuffer, (uint)inputLength);
                 pInputBuffer += numAsciiBytesCounted;
 
                 // Quick check - did we just end up consuming the entire input buffer?
diff --git a/src/DotNetty.Common/Internal/Utf8Utility.cs b/src/DotNetty.Common/Internal/Utf8Utility.cs
index ef81623ae..e7febc58b 100644
--- a/src/DotNetty.Common/Internal/Utf8Utility.cs
+++ b/src/DotNetty.Common/Internal/Utf8Utility.cs
@@ -41,9 +41,7 @@ public unsafe static int GetIndexOfFirstInvalidUtf8Sequence(in ReadOnlySpan<byte
         {
             fixed (byte* pUtf8Data = &MemoryMarshal.GetReference(utf8Data))
             {
-                byte* pFirstInvalidByte = PlatformDependent.Is64BitProcess
-                    ? Utf8Utility64.GetPointerToFirstInvalidByte(pUtf8Data, utf8Data.Length, out int utf16CodeUnitCountAdjustment, out _)
-                    : Utf8Utility32.GetPointerToFirstInvalidByte(pUtf8Data, utf8Data.Length, out utf16CodeUnitCountAdjustment, out _);
+                byte* pFirstInvalidByte = GetPointerToFirstInvalidByte(pUtf8Data, utf8Data.Length, out int utf16CodeUnitCountAdjustment, out _);
                 int index = (int)(void*)Unsafe.ByteOffset(ref *pUtf8Data, ref *pFirstInvalidByte);
 
                 isAscii = (0u >= (uint)utf16CodeUnitCountAdjustment); // If UTF-16 char count == UTF-8 byte count, it's ASCII.
diff --git a/src/DotNetty.Common/Internal/Utf8Utility32.Transcoding.cs b/src/DotNetty.Common/Internal/Utf8Utility32.Transcoding.cs
deleted file mode 100644
index 077eaac51..000000000
--- a/src/DotNetty.Common/Internal/Utf8Utility32.Transcoding.cs
+++ /dev/null
@@ -1,1477 +0,0 @@
-﻿// borrowed from https://github.com/dotnet/corefx/tree/release/3.1/src/Common/src/CoreLib/System/Text/Unicode
-
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
-
-#if NETCOREAPP_3_0_GREATER
-using System;
-using System.Buffers;
-using System.Buffers.Binary;
-using System.Diagnostics;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.Intrinsics.X86;
-using nint = System.Int32;
-using nuint = System.UInt32;
-
-namespace DotNetty.Common.Internal
-{
-    internal static unsafe partial class Utf8Utility32
-    {
-#if DEBUG
-        static Utf8Utility32()
-        {
-            Debug.Assert(sizeof(nint) == IntPtr.Size && nint.MinValue < 0, "nint is defined incorrectly.");
-            Debug.Assert(sizeof(nuint) == IntPtr.Size && nuint.MinValue == 0, "nuint is defined incorrectly.");
-
-            _ValidateAdditionalNIntDefinitions();
-        }
-#endif // DEBUG
-
-        // On method return, pInputBufferRemaining and pOutputBufferRemaining will both point to where
-        // the next byte would have been consumed from / the next char would have been written to.
-        // inputLength in bytes, outputCharsRemaining in chars.
-        public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLength, char* pOutputBuffer, int outputCharsRemaining, out byte* pInputBufferRemaining, out char* pOutputBufferRemaining)
-        {
-            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
-            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
-
-            Debug.Assert(outputCharsRemaining >= 0, "Destination length must not be negative.");
-            Debug.Assert(pOutputBuffer != null || outputCharsRemaining == 0, "Destination length must be zero if destination buffer pointer is null.");
-
-            // First, try vectorized conversion.
-
-            {
-                nuint numElementsConverted = ASCIIUtility32.WidenAsciiToUtf16(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputCharsRemaining));
-
-                pInputBuffer += numElementsConverted;
-                pOutputBuffer += numElementsConverted;
-
-                // Quick check - did we just end up consuming the entire input buffer?
-                // If so, short-circuit the remainder of the method.
-
-                if ((int)numElementsConverted == inputLength)
-                {
-                    pInputBufferRemaining = pInputBuffer;
-                    pOutputBufferRemaining = pOutputBuffer;
-                    return OperationStatus.Done;
-                }
-
-                inputLength -= (int)numElementsConverted;
-                outputCharsRemaining -= (int)numElementsConverted;
-            }
-
-            if (inputLength < sizeof(uint))
-            {
-                goto ProcessInputOfLessThanDWordSize;
-            }
-
-            byte* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - 4;
-
-            // Begin the main loop.
-
-#if DEBUG
-            byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
-#endif
-
-            while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
-            {
-                // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar.
-
-                uint thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-
-            AfterReadDWord:
-
-#if DEBUG
-                Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read.");
-                pLastBufferPosProcessed = pInputBuffer;
-#endif
-                // First, check for the common case of all-ASCII bytes.
-
-                if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
-                {
-                    // We read an all-ASCII sequence.
-
-                    if (outputCharsRemaining < sizeof(uint))
-                    {
-                        goto ProcessRemainingBytesSlow; // running out of space, but may be able to write some data
-                    }
-
-                    Utf8Utility.Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
-                    pInputBuffer += 4;
-                    pOutputBuffer += 4;
-                    outputCharsRemaining -= 4;
-
-                    // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
-                    // Below is basically unrolled loops with poor man's vectorization.
-
-                    uint remainingInputBytes = (uint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;
-                    uint maxIters = Math.Min(remainingInputBytes, (uint)outputCharsRemaining) / (2 * sizeof(uint));
-                    uint secondDWord;
-                    int i;
-                    for (i = 0; (uint)i < maxIters; i++)
-                    {
-                        // Reading two DWORDs in parallel benchmarked faster than reading a single QWORD.
-
-                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-                        secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + sizeof(uint));
-
-                        if (!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord | secondDWord))
-                        {
-                            goto LoopTerminatedEarlyDueToNonAsciiData;
-                        }
-
-                        pInputBuffer += 8;
-
-                        Utf8Utility.Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[0], thisDWord);
-                        Utf8Utility.Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[4], secondDWord);
-
-                        pOutputBuffer += 8;
-                    }
-
-                    outputCharsRemaining -= 8 * i;
-
-                    continue; // need to perform a bounds check because we might be running out of data
-
-                LoopTerminatedEarlyDueToNonAsciiData:
-
-                    if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
-                    {
-                        // The first DWORD contained all-ASCII bytes, so expand it.
-
-                        Utf8Utility.Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
-
-                        // continue the outer loop from the second DWORD
-
-                        Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(secondDWord));
-                        thisDWord = secondDWord;
-
-                        pInputBuffer += 4;
-                        pOutputBuffer += 4;
-                        outputCharsRemaining -= 4;
-                    }
-
-                    outputCharsRemaining -= 8 * i;
-
-                    // We know that there's *at least* one DWORD of data remaining in the buffer.
-                    // We also know that it's not all-ASCII. We can skip the logic at the beginning of the main loop.
-
-                    goto AfterReadDWordSkipAllBytesAsciiCheck;
-                }
-
-            AfterReadDWordSkipAllBytesAsciiCheck:
-
-                Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)); // this should have been handled earlier
-
-                // Next, try stripping off ASCII bytes one at a time.
-                // We only handle up to three ASCII bytes here since we handled the four ASCII byte case above.
-
-                if (Utf8Utility.UInt32FirstByteIsAscii(thisDWord))
-                {
-                    if (outputCharsRemaining >= 3)
-                    {
-                        // Fast-track: we don't need to check the destination length for subsequent
-                        // ASCII bytes since we know we can write them all now.
-
-                        uint thisDWordLittleEndian = Utf8Utility.ToLittleEndian(thisDWord);
-
-                        nuint adjustment = 1;
-                        pOutputBuffer[0] = (char)(byte)thisDWordLittleEndian;
-
-                        if (Utf8Utility.UInt32SecondByteIsAscii(thisDWord))
-                        {
-                            adjustment++;
-                            thisDWordLittleEndian >>= 8;
-                            pOutputBuffer[1] = (char)(byte)thisDWordLittleEndian;
-
-                            if (Utf8Utility.UInt32ThirdByteIsAscii(thisDWord))
-                            {
-                                adjustment++;
-                                thisDWordLittleEndian >>= 8;
-                                pOutputBuffer[2] = (char)(byte)thisDWordLittleEndian;
-                            }
-                        }
-
-                        pInputBuffer += adjustment;
-                        pOutputBuffer += adjustment;
-                        outputCharsRemaining -= (int)adjustment;
-                    }
-                    else
-                    {
-                        // Slow-track: we need to make sure each individual write has enough
-                        // of a buffer so that we don't overrun the destination.
-
-                        if (0u >= (uint)outputCharsRemaining)
-                        {
-                            goto OutputBufferTooSmall;
-                        }
-
-                        uint thisDWordLittleEndian = Utf8Utility.ToLittleEndian(thisDWord);
-
-                        pInputBuffer++;
-                        *pOutputBuffer++ = (char)(byte)thisDWordLittleEndian;
-                        outputCharsRemaining--;
-
-                        if (Utf8Utility.UInt32SecondByteIsAscii(thisDWord))
-                        {
-                            if (0u >= (uint)outputCharsRemaining)
-                            {
-                                goto OutputBufferTooSmall;
-                            }
-
-                            pInputBuffer++;
-                            thisDWordLittleEndian >>= 8;
-                            *pOutputBuffer++ = (char)(byte)thisDWordLittleEndian;
-
-                            // We can perform a small optimization here. We know at this point that
-                            // the output buffer is fully consumed (we read two ASCII bytes and wrote
-                            // two ASCII chars, and we checked earlier that the destination buffer
-                            // can't store a third byte). If the next byte is ASCII, we can jump straight
-                            // to the return statement since the end-of-method logic only relies on the
-                            // destination buffer pointer -- NOT the output chars remaining count -- being
-                            // correct. If the next byte is not ASCII, we'll need to continue with the
-                            // rest of the main loop, but we can set the buffer length directly to zero
-                            // rather than decrementing it from 1 to 0.
-
-                            Debug.Assert(outputCharsRemaining == 1);
-
-                            if (Utf8Utility.UInt32ThirdByteIsAscii(thisDWord))
-                            {
-                                goto OutputBufferTooSmall;
-                            }
-                            else
-                            {
-                                outputCharsRemaining = 0;
-                            }
-                        }
-                    }
-
-                    if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
-                    {
-                        goto ProcessRemainingBytesSlow; // input buffer doesn't contain enough data to read a DWORD
-                    }
-                    else
-                    {
-                        // The input buffer at the current offset contains a non-ASCII byte.
-                        // Read an entire DWORD and fall through to multi-byte consumption logic.
-                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-                    }
-                }
-
-            BeforeProcessTwoByteSequence:
-
-                // At this point, we know we're working with a multi-byte code unit,
-                // but we haven't yet validated it.
-
-                // The masks and comparands are derived from the Unicode Standard, Table 3-6.
-                // Additionally, we need to check for valid byte sequences per Table 3-7.
-
-                // Check the 2-byte case.
-
-                if (Utf8Utility.UInt32BeginsWithUtf8TwoByteMask(thisDWord))
-                {
-                    // Per Table 3-7, valid sequences are:
-                    // [ C2..DF ] [ 80..BF ]
-
-                    if (Utf8Utility.UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
-                    {
-                        goto Error;
-                    }
-
-                ProcessTwoByteSequenceSkipOverlongFormCheck:
-
-                    // Optimization: If this is a two-byte-per-character language like Cyrillic or Hebrew,
-                    // there's a good chance that if we see one two-byte run then there's another two-byte
-                    // run immediately after. Let's check that now.
-
-                    // On little-endian platforms, we can check for the two-byte UTF8 mask *and* validate that
-                    // the value isn't overlong using a single comparison. On big-endian platforms, we'll need
-                    // to validate the mask and validate that the sequence isn't overlong as two separate comparisons.
-
-                    if ((BitConverter.IsLittleEndian && Utf8Utility.UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
-                        || (!BitConverter.IsLittleEndian && (Utf8Utility.UInt32EndsWithUtf8TwoByteMask(thisDWord) && !Utf8Utility.UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord))))
-                    {
-                        // We have two runs of two bytes each.
-
-                        if (outputCharsRemaining < 2)
-                        {
-                            goto ProcessRemainingBytesSlow; // running out of output buffer
-                        }
-
-                        Unsafe.WriteUnaligned<uint>(pOutputBuffer, Utf8Utility.ExtractTwoCharsPackedFromTwoAdjacentTwoByteSequences(thisDWord));
-
-                        pInputBuffer += 4;
-                        pOutputBuffer += 2;
-                        outputCharsRemaining -= 2;
-
-                        if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
-                        {
-                            // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
-                            // also two bytes. Check for that first before going back to the beginning of the loop.
-
-                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-
-                            if (BitConverter.IsLittleEndian)
-                            {
-                                if (Utf8Utility.UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
-                                {
-                                    // The next sequence is a valid two-byte sequence.
-                                    goto ProcessTwoByteSequenceSkipOverlongFormCheck;
-                                }
-                            }
-                            else
-                            {
-                                if (Utf8Utility.UInt32BeginsWithUtf8TwoByteMask(thisDWord))
-                                {
-                                    if (Utf8Utility.UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
-                                    {
-                                        goto Error; // The next sequence purports to be a 2-byte sequence but is overlong.
-                                    }
-
-                                    goto ProcessTwoByteSequenceSkipOverlongFormCheck;
-                                }
-                            }
-
-                            // If we reached this point, the next sequence is something other than a valid
-                            // two-byte sequence, so go back to the beginning of the loop.
-                            goto AfterReadDWord;
-                        }
-                        else
-                        {
-                            goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
-                        }
-                    }
-
-                    // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
-                    // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
-                    // bytes are ASCII?
-
-                    uint charToWrite = Utf8Utility.ExtractCharFromFirstTwoByteSequence(thisDWord); // optimistically compute this now, but don't store until we know dest is large enough
-
-                    if (Utf8Utility.UInt32ThirdByteIsAscii(thisDWord))
-                    {
-                        if (Utf8Utility.UInt32FourthByteIsAscii(thisDWord))
-                        {
-                            if (outputCharsRemaining < 3)
-                            {
-                                goto ProcessRemainingBytesSlow; // running out of output buffer
-                            }
-
-                            pOutputBuffer[0] = (char)charToWrite;
-                            if (BitConverter.IsLittleEndian)
-                            {
-                                thisDWord >>= 16;
-                                pOutputBuffer[1] = (char)(byte)thisDWord;
-                                thisDWord >>= 8;
-                                pOutputBuffer[2] = (char)thisDWord;
-                            }
-                            else
-                            {
-                                pOutputBuffer[2] = (char)(byte)thisDWord;
-                                pOutputBuffer[1] = (char)(byte)(thisDWord >> 8);
-                            }
-                            pInputBuffer += 4;
-                            pOutputBuffer += 3;
-                            outputCharsRemaining -= 3;
-
-                            continue; // go back to original bounds check and check for ASCII
-                        }
-                        else
-                        {
-                            if (outputCharsRemaining < 2)
-                            {
-                                goto ProcessRemainingBytesSlow; // running out of output buffer
-                            }
-
-                            pOutputBuffer[0] = (char)charToWrite;
-                            pOutputBuffer[1] = (char)(byte)(thisDWord >> (BitConverter.IsLittleEndian ? 16 : 8));
-                            pInputBuffer += 3;
-                            pOutputBuffer += 2;
-                            outputCharsRemaining -= 2;
-
-                            // A two-byte sequence followed by an ASCII byte followed by a non-ASCII byte.
-                            // Read in the next DWORD and jump directly to the start of the multi-byte processing block.
-
-                            if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer)
-                            {
-                                goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
-                            }
-                            else
-                            {
-                                thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-                                goto BeforeProcessTwoByteSequence;
-                            }
-                        }
-                    }
-                    else
-                    {
-                        if (0u >= (uint)outputCharsRemaining)
-                        {
-                            goto ProcessRemainingBytesSlow; // running out of output buffer
-                        }
-
-                        pOutputBuffer[0] = (char)charToWrite;
-                        pInputBuffer += 2;
-                        pOutputBuffer += 1;
-                        outputCharsRemaining--;
-
-                        if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer)
-                        {
-                            goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
-                        }
-                        else
-                        {
-                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-                            goto BeforeProcessThreeByteSequence; // we know the next byte isn't ASCII, and it's not the start of a 2-byte sequence (this was checked above)
-                        }
-                    }
-                }
-
-            // Check the 3-byte case.
-
-            BeforeProcessThreeByteSequence:
-
-                if (Utf8Utility.UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
-                {
-                ProcessThreeByteSequenceWithCheck:
-
-                    // We need to check for overlong or surrogate three-byte sequences.
-                    //
-                    // Per Table 3-7, valid sequences are:
-                    // [   E0   ] [ A0..BF ] [ 80..BF ]
-                    // [ E1..EC ] [ 80..BF ] [ 80..BF ]
-                    // [   ED   ] [ 80..9F ] [ 80..BF ]
-                    // [ EE..EF ] [ 80..BF ] [ 80..BF ]
-                    //
-                    // Big-endian examples of using the above validation table:
-                    // E0A0 = 1110 0000 1010 0000 => invalid (overlong ) patterns are 1110 0000 100# ####
-                    // ED9F = 1110 1101 1001 1111 => invalid (surrogate) patterns are 1110 1101 101# ####
-                    // If using the bitmask ......................................... 0000 1111 0010 0000 (=0F20),
-                    // Then invalid (overlong) patterns match the comparand ......... 0000 0000 0000 0000 (=0000),
-                    // And invalid (surrogate) patterns match the comparand ......... 0000 1101 0010 0000 (=0D20).
-
-                    if (BitConverter.IsLittleEndian)
-                    {
-                        // The "overlong or surrogate" check can be implemented using a single jump, but there's
-                        // some overhead to moving the bits into the correct locations in order to perform the
-                        // correct comparison, and in practice the processor's branch prediction capability is
-                        // good enough that we shouldn't bother. So we'll use two jumps instead.
-
-                        // Can't extract this check into its own helper method because JITter produces suboptimal
-                        // assembly, even with aggressive inlining.
-
-                        // Code below becomes 5 instructions: test, jz, lea, test, jz
-
-                        if ((0u >= (thisDWord & 0x0000_200Fu)) || (0u >= ((thisDWord - 0x0000_200Du) & 0x0000_200Fu)))
-                        {
-                            goto Error; // overlong or surrogate
-                        }
-                    }
-                    else
-                    {
-                        if ((0u >= (thisDWord & 0x0F20_0000u)) || (0u >= ((thisDWord - 0x0D20_0000u) & 0x0F20_0000u)))
-                        {
-                            goto Error; // overlong or surrogate
-                        }
-                    }
-
-                    // At this point, we know the incoming scalar is well-formed.
-
-                    if (0u >= (uint)outputCharsRemaining)
-                    {
-                        goto OutputBufferTooSmall; // not enough space in the destination buffer to write
-                    }
-
-                    // As an optimization, on compatible platforms check if a second three-byte sequence immediately
-                    // follows the one we just read, and if so use BSWAP and BMI2 to extract them together.
-
-                    if (Bmi2.X64.IsSupported)
-                    {
-                        Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian.");
-
-                        // First, check that the leftover byte from the original DWORD is in the range [ E0..EF ], which
-                        // would indicate the potential start of a second three-byte sequence.
-
-                        if (0u >= ((thisDWord - 0xE000_0000u) & 0xF000_0000u))
-                        {
-                            // The const '3' below is correct because pFinalPosWhereCanReadDWordFromInputBuffer represents
-                            // the final place where we can safely perform a DWORD read, and we want to probe whether it's
-                            // safe to read a DWORD beginning at address &pInputBuffer[3].
-
-                            if (outputCharsRemaining > 1 && (nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 3)
-                            {
-                                // We're going to attempt to read a second 3-byte sequence and write them both out simultaneously using PEXT.
-                                // We need to check the continuation bit mask on the remaining two bytes (and we may as well check the leading
-                                // byte mask again since it's free), then perform overlong + surrogate checks. If the overlong or surrogate
-                                // checks fail, we'll fall through to the remainder of the logic which will transcode the original valid
-                                // 3-byte UTF-8 sequence we read; and on the next iteration of the loop the validation routine will run again,
-                                // fail, and redirect control flow to the error handling logic at the very end of this method.
-
-                                uint secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + 3);
-
-                                if (Utf8Utility.UInt32BeginsWithUtf8ThreeByteMask(secondDWord)
-                                    && ((secondDWord & 0x0000_200Fu) != 0)
-                                    && (((secondDWord - 0x0000_200Du) & 0x0000_200Fu) != 0))
-                                {
-                                    // combinedQWord = [ 1110ZZZZ 10YYYYYY 10XXXXXX ######## | 1110zzzz 10yyyyyy 10xxxxxx ######## ], where xyz are from first DWORD, XYZ are from second DWORD
-                                    ulong combinedQWord = ((ulong)BinaryPrimitives.ReverseEndianness(secondDWord) << 32) | BinaryPrimitives.ReverseEndianness(thisDWord);
-                                    thisDWord = secondDWord; // store this value in the correct local for the ASCII drain logic
-
-                                    // extractedQWord = [ 00000000 00000000 00000000 00000000 | ZZZZYYYYYYXXXXXX zzzzyyyyyyxxxxxx ]
-                                    ulong extractedQWord = Bmi2.X64.ParallelBitExtract(combinedQWord, 0x0F3F3F00_0F3F3F00ul);
-
-                                    Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)extractedQWord);
-                                    pInputBuffer += 6;
-                                    pOutputBuffer += 2;
-                                    outputCharsRemaining -= 2;
-
-                                    // Drain any ASCII data following the second three-byte sequence.
-
-                                    goto CheckForAsciiByteAfterThreeByteSequence;
-                                }
-                            }
-                        }
-                    }
-
-                    // Couldn't extract 2x three-byte sequences together, just do this one by itself.
-
-                    *pOutputBuffer = (char)Utf8Utility.ExtractCharFromFirstThreeByteSequence(thisDWord);
-                    pInputBuffer += 3;
-                    pOutputBuffer += 1;
-                    outputCharsRemaining -= 1;
-
-                CheckForAsciiByteAfterThreeByteSequence:
-
-                    // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
-                    // in to the text. If this happens strip it off now before seeing if the next character
-                    // consists of three code units.
-
-                    if (Utf8Utility.UInt32FourthByteIsAscii(thisDWord))
-                    {
-                        if (0u >= (uint)outputCharsRemaining)
-                        {
-                            goto OutputBufferTooSmall;
-                        }
-
-                        if (BitConverter.IsLittleEndian)
-                        {
-                            *pOutputBuffer = (char)(thisDWord >> 24);
-                        }
-                        else
-                        {
-                            *pOutputBuffer = (char)(byte)thisDWord;
-                        }
-
-                        pInputBuffer += 1;
-                        pOutputBuffer += 1;
-                        outputCharsRemaining -= 1;
-                    }
-
-                    if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
-                    {
-                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-
-                        // Optimization: A three-byte character could indicate CJK text, which makes it likely
-                        // that the character following this one is also CJK. We'll check for a three-byte sequence
-                        // marker now and jump directly to three-byte sequence processing if we see one, skipping
-                        // all of the logic at the beginning of the loop.
-
-                        if (Utf8Utility.UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
-                        {
-                            goto ProcessThreeByteSequenceWithCheck; // found a three-byte sequence marker; validate and consume
-                        }
-                        else
-                        {
-                            goto AfterReadDWord; // probably ASCII punctuation or whitespace
-                        }
-                    }
-                    else
-                    {
-                        goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
-                    }
-                }
-
-                // Assume the 4-byte case, but we need to validate.
-
-                {
-                    // We need to check for overlong or invalid (over U+10FFFF) four-byte sequences.
-                    //
-                    // Per Table 3-7, valid sequences are:
-                    // [   F0   ] [ 90..BF ] [ 80..BF ] [ 80..BF ]
-                    // [ F1..F3 ] [ 80..BF ] [ 80..BF ] [ 80..BF ]
-                    // [   F4   ] [ 80..8F ] [ 80..BF ] [ 80..BF ]
-
-                    if (!Utf8Utility.UInt32BeginsWithUtf8FourByteMask(thisDWord))
-                    {
-                        goto Error;
-                    }
-
-                    // Now check for overlong / out-of-range sequences.
-
-                    if (BitConverter.IsLittleEndian)
-                    {
-                        // The DWORD we read is [ 10xxxxxx 10yyyyyy 10zzzzzz 11110www ].
-                        // We want to get the 'w' byte in front of the 'z' byte so that we can perform
-                        // a single range comparison. We'll take advantage of the fact that the JITter
-                        // can detect a ROR / ROL operation, then we'll just zero out the bytes that
-                        // aren't involved in the range check.
-
-                        uint toCheck = thisDWord & 0x0000_FFFFu;
-
-                        // At this point, toCheck = [ 00000000 00000000 10zzzzzz 11110www ].
-
-                        toCheck = BitOperations.RotateRight(toCheck, 8);
-
-                        // At this point, toCheck = [ 11110www 00000000 00000000 10zzzzzz ].
-
-                        if (!UnicodeUtility.IsInRangeInclusive(toCheck, 0xF000_0090u, 0xF400_008Fu))
-                        {
-                            goto Error;
-                        }
-                    }
-                    else
-                    {
-                        if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0xF090_0000u, 0xF48F_FFFFu))
-                        {
-                            goto Error;
-                        }
-                    }
-
-                    // Validation complete.
-
-                    if (outputCharsRemaining < 2)
-                    {
-                        // There's no point to falling back to the "drain the input buffer" logic, since we know
-                        // we can't write anything to the destination. So we'll just exit immediately.
-                        goto OutputBufferTooSmall;
-                    }
-
-                    Unsafe.WriteUnaligned<uint>(pOutputBuffer, Utf8Utility.ExtractCharsFromFourByteSequence(thisDWord));
-
-                    pInputBuffer += 4;
-                    pOutputBuffer += 2;
-                    outputCharsRemaining -= 2;
-
-                    continue; // go back to beginning of loop for processing
-                }
-            }
-
-        ProcessRemainingBytesSlow:
-            inputLength = (int)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;
-
-        ProcessInputOfLessThanDWordSize:
-            while (inputLength > 0)
-            {
-                uint firstByte = pInputBuffer[0];
-                if (firstByte <= 0x7Fu)
-                {
-                    if (0u >= (uint)outputCharsRemaining)
-                    {
-                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
-                    }
-
-                    // 1-byte (ASCII) case
-                    *pOutputBuffer = (char)firstByte;
-
-                    pInputBuffer += 1;
-                    pOutputBuffer += 1;
-                    inputLength -= 1;
-                    outputCharsRemaining -= 1;
-                    continue;
-                }
-
-                // Potentially the start of a multi-byte sequence?
-
-                firstByte -= 0xC2u;
-                if ((byte)firstByte <= (0xDFu - 0xC2u))
-                {
-                    // Potentially a 2-byte sequence?
-                    if (inputLength < 2)
-                    {
-                        goto InputBufferTooSmall; // out of data
-                    }
-
-                    uint secondByte = pInputBuffer[1];
-                    if (!Utf8Utility.IsLowByteUtf8ContinuationByte(secondByte))
-                    {
-                        goto Error; // 2-byte marker not followed by continuation byte
-                    }
-
-                    if (0u >= (uint)outputCharsRemaining)
-                    {
-                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
-                    }
-
-                    uint asChar = (firstByte << 6) + secondByte + ((0xC2u - 0xC0u) << 6) - 0x80u; // remove UTF-8 markers from scalar
-                    *pOutputBuffer = (char)asChar;
-
-                    pInputBuffer += 2;
-                    pOutputBuffer += 1;
-                    inputLength -= 2;
-                    outputCharsRemaining -= 1;
-                    continue;
-                }
-                else if ((byte)firstByte <= (0xEFu - 0xC2u))
-                {
-                    // Potentially a 3-byte sequence?
-                    if (inputLength >= 3)
-                    {
-                        uint secondByte = pInputBuffer[1];
-                        uint thirdByte = pInputBuffer[2];
-                        if (!Utf8Utility.IsLowByteUtf8ContinuationByte(secondByte) || !Utf8Utility.IsLowByteUtf8ContinuationByte(thirdByte))
-                        {
-                            goto Error; // 3-byte marker not followed by 2 continuation bytes
-                        }
-
-                        // To speed up the validation logic below, we're not going to remove the UTF-8 markers from the partial char just yet.
-                        // We account for this in the comparisons below.
-
-                        uint partialChar = (firstByte << 12) + (secondByte << 6);
-                        if (partialChar < ((0xE0u - 0xC2u) << 12) + (0xA0u << 6))
-                        {
-                            goto Error; // this is an overlong encoding; fail
-                        }
-
-                        partialChar -= ((0xEDu - 0xC2u) << 12) + (0xA0u << 6); //if partialChar = 0, we're at beginning of UTF-16 surrogate code point range
-                        if (partialChar < (0x0800u /* number of code points in UTF-16 surrogate code point range */))
-                        {
-                            goto Error; // attempted to encode a UTF-16 surrogate code point; fail
-                        }
-
-                        if (0u >= (uint)outputCharsRemaining)
-                        {
-                            goto OutputBufferTooSmall; // we have no hope of writing anything to the output
-                        }
-
-                        // Now restore the full scalar value.
-
-                        partialChar += thirdByte;
-                        partialChar += 0xD800; // undo "move to beginning of UTF-16 surrogate code point range" from earlier, fold it with later adds
-                        partialChar -= 0x80u; // remove third byte continuation marker
-
-                        *pOutputBuffer = (char)partialChar;
-
-                        pInputBuffer += 3;
-                        pOutputBuffer += 1;
-                        inputLength -= 3;
-                        outputCharsRemaining -= 1;
-                        continue;
-                    }
-                    else if (inputLength >= 2)
-                    {
-                        uint secondByte = pInputBuffer[1];
-                        if (!Utf8Utility.IsLowByteUtf8ContinuationByte(secondByte))
-                        {
-                            goto Error; // 3-byte marker not followed by continuation byte
-                        }
-
-                        // We can't build up the entire scalar value now, but we can check for overlong / surrogate representations
-                        // from just the first two bytes.
-
-                        uint partialChar = (firstByte << 6) + secondByte; // don't worry about fixing up the UTF-8 markers; we'll account for it in the below comparison
-                        if (partialChar < ((0xE0u - 0xC2u) << 6) + 0xA0u)
-                        {
-                            goto Error; // failed overlong check
-                        }
-                        if (UnicodeUtility.IsInRangeInclusive(partialChar, ((0xEDu - 0xC2u) << 6) + 0xA0u, ((0xEEu - 0xC2u) << 6) + 0x7Fu))
-                        {
-                            goto Error; // failed surrogate check
-                        }
-                    }
-
-                    goto InputBufferTooSmall; // out of data
-                }
-                else if ((byte)firstByte <= (0xF4u - 0xC2u))
-                {
-                    // Potentially a 4-byte sequence?
-
-                    if (inputLength < 2)
-                    {
-                        goto InputBufferTooSmall; // ran out of data
-                    }
-
-                    uint nextByte = pInputBuffer[1];
-                    if (!Utf8Utility.IsLowByteUtf8ContinuationByte(nextByte))
-                    {
-                        goto Error; // 4-byte marker not followed by a continuation byte
-                    }
-
-                    uint asPartialChar = (firstByte << 6) + nextByte; // don't worry about fixing up the UTF-8 markers; we'll account for it in the below comparison
-                    if (!UnicodeUtility.IsInRangeInclusive(asPartialChar, ((0xF0u - 0xC2u) << 6) + 0x90u, ((0xF4u - 0xC2u) << 6) + 0x8Fu))
-                    {
-                        goto Error; // failed overlong / out-of-range check
-                    }
-
-                    if (inputLength < 3)
-                    {
-                        goto InputBufferTooSmall; // ran out of data
-                    }
-
-                    if (!Utf8Utility.IsLowByteUtf8ContinuationByte(pInputBuffer[2]))
-                    {
-                        goto Error; // third byte in 4-byte sequence not a continuation byte
-                    }
-
-                    if (inputLength < 4)
-                    {
-                        goto InputBufferTooSmall; // ran out of data
-                    }
-
-                    if (!Utf8Utility.IsLowByteUtf8ContinuationByte(pInputBuffer[3]))
-                    {
-                        goto Error; // fourth byte in 4-byte sequence not a continuation byte
-                    }
-
-                    // If we read a valid astral scalar value, the only way we could've fallen down this code path
-                    // is that we didn't have enough output buffer to write the result.
-
-                    goto OutputBufferTooSmall;
-                }
-                else
-                {
-                    goto Error; // didn't begin with [ C2 .. F4 ], so invalid multi-byte sequence header byte
-                }
-            }
-
-            OperationStatus retVal = OperationStatus.Done;
-            goto ReturnCommon;
-
-        InputBufferTooSmall:
-            retVal = OperationStatus.NeedMoreData;
-            goto ReturnCommon;
-
-        OutputBufferTooSmall:
-            retVal = OperationStatus.DestinationTooSmall;
-            goto ReturnCommon;
-
-        Error:
-            retVal = OperationStatus.InvalidData;
-            goto ReturnCommon;
-
-        ReturnCommon:
-            pInputBufferRemaining = pInputBuffer;
-            pOutputBufferRemaining = pOutputBuffer;
-            return retVal;
-        }
-
-        // On method return, pInputBufferRemaining and pOutputBufferRemaining will both point to where
-        // the next char would have been consumed from / the next byte would have been written to.
-        // inputLength in chars, outputBytesRemaining in bytes.
-        public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLength, byte* pOutputBuffer, int outputBytesRemaining, out char* pInputBufferRemaining, out byte* pOutputBufferRemaining)
-        {
-            const int CharsPerDWord = sizeof(uint) / sizeof(char);
-
-            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
-            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
-
-            Debug.Assert(outputBytesRemaining >= 0, "Destination length must not be negative.");
-            Debug.Assert(pOutputBuffer != null || outputBytesRemaining == 0, "Destination length must be zero if destination buffer pointer is null.");
-
-            // First, try vectorized conversion.
-
-            {
-                nuint numElementsConverted = ASCIIUtility32.NarrowUtf16ToAscii(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputBytesRemaining));
-
-                pInputBuffer += numElementsConverted;
-                pOutputBuffer += numElementsConverted;
-
-                // Quick check - did we just end up consuming the entire input buffer?
-                // If so, short-circuit the remainder of the method.
-
-                if ((int)numElementsConverted == inputLength)
-                {
-                    pInputBufferRemaining = pInputBuffer;
-                    pOutputBufferRemaining = pOutputBuffer;
-                    return OperationStatus.Done;
-                }
-
-                inputLength -= (int)numElementsConverted;
-                outputBytesRemaining -= (int)numElementsConverted;
-            }
-
-            if (inputLength < CharsPerDWord)
-            {
-                goto ProcessInputOfLessThanDWordSize;
-            }
-
-            char* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - CharsPerDWord;
-
-            // Begin the main loop.
-
-#if DEBUG
-            char* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
-#endif
-
-            uint thisDWord;
-
-            while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
-            {
-                // Read 32 bits at a time. This is enough to hold any possible UTF16-encoded scalar.
-
-                thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-
-            AfterReadDWord:
-
-#if DEBUG
-                Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read.");
-                pLastBufferPosProcessed = pInputBuffer;
-#endif
-
-                // First, check for the common case of all-ASCII chars.
-
-                if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
-                {
-                    // We read an all-ASCII sequence (2 chars).
-
-                    if (outputBytesRemaining < 2)
-                    {
-                        goto ProcessOneCharFromCurrentDWordAndFinish; // running out of space, but may be able to write some data
-                    }
-
-                    // The high WORD of the local declared below might be populated with garbage
-                    // as a result of our shifts below, but that's ok since we're only going to
-                    // write the low WORD.
-                    //
-                    // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
-                    // (Same logic works regardless of endianness.)
-                    uint valueToWrite = thisDWord | (thisDWord >> 8);
-
-                    Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)valueToWrite);
-
-                    pInputBuffer += 2;
-                    pOutputBuffer += 2;
-                    outputBytesRemaining -= 2;
-
-                    // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
-                    // Below is basically unrolled loops with poor man's vectorization.
-
-                    uint inputCharsRemaining = (uint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + 2;
-                    uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining);
-
-                    if (Bmi2.X64.IsSupported)
-                    {
-                        Debug.Assert(BitConverter.IsLittleEndian, "BMI2 requires little-endian.");
-                        const ulong PEXT_MASK = 0x00FF00FF_00FF00FFul;
-
-                        // Try reading and writing 8 elements per iteration.
-                        uint maxIters = minElementsRemaining / 8;
-                        ulong firstQWord, secondQWord;
-                        int i;
-                        for (i = 0; (uint)i < maxIters; i++)
-                        {
-                            firstQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
-                            secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer + 4);
-
-                            if (!Utf16Utility.AllCharsInUInt64AreAscii(firstQWord | secondQWord))
-                            {
-                                goto LoopTerminatedDueToNonAsciiData;
-                            }
-
-                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK));
-                            Unsafe.WriteUnaligned<uint>(pOutputBuffer + 4, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK));
-
-                            pInputBuffer += 8;
-                            pOutputBuffer += 8;
-                        }
-
-                        outputBytesRemaining -= 8 * i;
-
-                        // Can we perform one more iteration, but reading & writing 4 elements instead of 8?
-
-                        if ((minElementsRemaining & 4) != 0)
-                        {
-                            secondQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
-
-                            if (!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord))
-                            {
-                                goto LoopTerminatedDueToNonAsciiDataInSecondQWord;
-                            }
-
-                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(secondQWord, PEXT_MASK));
-
-                            pInputBuffer += 4;
-                            pOutputBuffer += 4;
-                            outputBytesRemaining -= 4;
-                        }
-
-                        continue; // Go back to beginning of main loop, read data, check for ASCII
-
-                    LoopTerminatedDueToNonAsciiData:
-
-                        outputBytesRemaining -= 8 * i;
-
-                        // First, see if we can drain any ASCII data from the first QWORD.
-
-                        if (Utf16Utility.AllCharsInUInt64AreAscii(firstQWord))
-                        {
-                            Unsafe.WriteUnaligned<uint>(pOutputBuffer, (uint)Bmi2.X64.ParallelBitExtract(firstQWord, PEXT_MASK));
-                            pInputBuffer += 4;
-                            pOutputBuffer += 4;
-                            outputBytesRemaining -= 4;
-                        }
-                        else
-                        {
-                            secondQWord = firstQWord;
-                        }
-
-                    LoopTerminatedDueToNonAsciiDataInSecondQWord:
-
-                        Debug.Assert(!Utf16Utility.AllCharsInUInt64AreAscii(secondQWord)); // this condition should've been checked earlier
-
-                        thisDWord = (uint)secondQWord;
-                        if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
-                        {
-                            // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
-                            Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
-                            pInputBuffer += 2;
-                            pOutputBuffer += 2;
-                            outputBytesRemaining -= 2;
-                            thisDWord = (uint)(secondQWord >> 32);
-                        }
-
-                        goto AfterReadDWordSkipAllCharsAsciiCheck;
-                    }
-                    else
-                    {
-                        // Can't use BMI2 x64, so we'll only read and write 4 elements per iteration.
-                        uint maxIters = minElementsRemaining / 4;
-                        uint secondDWord;
-                        int i;
-                        for (i = 0; (uint)i < maxIters; i++)
-                        {
-                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-                            secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + 2);
-
-                            if (!Utf16Utility.AllCharsInUInt32AreAscii(thisDWord | secondDWord))
-                            {
-                                goto LoopTerminatedDueToNonAsciiData;
-                            }
-
-                            // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
-                            // (Same logic works regardless of endianness.)
-                            Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
-                            Unsafe.WriteUnaligned<ushort>(pOutputBuffer + 2, (ushort)(secondDWord | (secondDWord >> 8)));
-
-                            pInputBuffer += 4;
-                            pOutputBuffer += 4;
-                        }
-
-                        outputBytesRemaining -= 4 * i;
-
-                        continue; // Go back to beginning of main loop, read data, check for ASCII
-
-                    LoopTerminatedDueToNonAsciiData:
-
-                        outputBytesRemaining -= 4 * i;
-
-                        // First, see if we can drain any ASCII data from the first DWORD.
-
-                        if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
-                        {
-                            // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
-                            // (Same logic works regardless of endianness.)
-                            Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
-                            pInputBuffer += 2;
-                            pOutputBuffer += 2;
-                            outputBytesRemaining -= 2;
-                            thisDWord = secondDWord;
-                        }
-
-                        goto AfterReadDWordSkipAllCharsAsciiCheck;
-                    }
-                }
-
-            AfterReadDWordSkipAllCharsAsciiCheck:
-
-                Debug.Assert(!Utf16Utility.AllCharsInUInt32AreAscii(thisDWord)); // this should have been handled earlier
-
-                // Next, try stripping off the first ASCII char if it exists.
-                // We don't check for a second ASCII char since that should have been handled above.
-
-                if (Utf8Utility.IsFirstCharAscii(thisDWord))
-                {
-                    if (0u >= (uint)outputBytesRemaining)
-                    {
-                        goto OutputBufferTooSmall;
-                    }
-
-                    if (BitConverter.IsLittleEndian)
-                    {
-                        pOutputBuffer[0] = (byte)thisDWord; // extract [ ## ## 00 AA ]
-                    }
-                    else
-                    {
-                        pOutputBuffer[0] = (byte)(thisDWord >> 24); // extract [ AA 00 ## ## ]
-                    }
-
-                    pInputBuffer += 1;
-                    pOutputBuffer += 1;
-                    outputBytesRemaining -= 1;
-
-                    if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
-                    {
-                        goto ProcessNextCharAndFinish; // input buffer doesn't contain enough data to read a DWORD
-                    }
-                    else
-                    {
-                        // The input buffer at the current offset contains a non-ASCII char.
-                        // Read an entire DWORD and fall through to non-ASCII consumption logic.
-                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-                    }
-                }
-
-                // At this point, we know the first char in the buffer is non-ASCII, but we haven't yet validated it.
-
-                if (!Utf8Utility.IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
-                {
-                TryConsumeMultipleTwoByteSequences:
-
-                    // For certain text (Greek, Cyrillic, ...), 2-byte sequences tend to be clustered. We'll try transcoding them in
-                    // a tight loop without falling back to the main loop.
-
-                    if (Utf8Utility.IsSecondCharTwoUtf8Bytes(thisDWord))
-                    {
-                        // We have two runs of two bytes each.
-
-                        if (outputBytesRemaining < 4)
-                        {
-                            goto ProcessOneCharFromCurrentDWordAndFinish; // running out of output buffer
-                        }
-
-                        Unsafe.WriteUnaligned<uint>(pOutputBuffer, Utf8Utility.ExtractTwoUtf8TwoByteSequencesFromTwoPackedUtf16Chars(thisDWord));
-
-                        pInputBuffer += 2;
-                        pOutputBuffer += 4;
-                        outputBytesRemaining -= 4;
-
-                        if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
-                        {
-                            goto ProcessNextCharAndFinish; // Running out of data - go down slow path
-                        }
-                        else
-                        {
-                            // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
-                            // also two bytes. Check for that first before going back to the beginning of the loop.
-
-                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-
-                            if (Utf8Utility.IsFirstCharTwoUtf8Bytes(thisDWord))
-                            {
-                                // Validated we have a two-byte sequence coming up
-                                goto TryConsumeMultipleTwoByteSequences;
-                            }
-
-                            // If we reached this point, the next sequence is something other than a valid
-                            // two-byte sequence, so go back to the beginning of the loop.
-                            goto AfterReadDWord;
-                        }
-                    }
-
-                    if (outputBytesRemaining < 2)
-                    {
-                        goto OutputBufferTooSmall;
-                    }
-
-                    Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)Utf8Utility.ExtractUtf8TwoByteSequenceFromFirstUtf16Char(thisDWord));
-
-                    // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
-                    // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
-                    // char is ASCII?
-
-                    if (Utf8Utility.IsSecondCharAscii(thisDWord))
-                    {
-                        if (outputBytesRemaining >= 3)
-                        {
-                            if (BitConverter.IsLittleEndian)
-                            {
-                                thisDWord >>= 16;
-                            }
-                            pOutputBuffer[2] = (byte)thisDWord;
-
-                            pInputBuffer += 2;
-                            pOutputBuffer += 3;
-                            outputBytesRemaining -= 3;
-
-                            continue; // go back to original bounds check and check for ASCII
-                        }
-                        else
-                        {
-                            pInputBuffer += 1;
-                            pOutputBuffer += 2;
-                            goto OutputBufferTooSmall;
-                        }
-                    }
-                    else
-                    {
-                        pInputBuffer += 1;
-                        pOutputBuffer += 2;
-                        outputBytesRemaining -= 2;
-
-                        if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
-                        {
-                            goto ProcessNextCharAndFinish; // Running out of data - go down slow path
-                        }
-                        else
-                        {
-                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-                            goto BeforeProcessThreeByteSequence; // we know the next byte isn't ASCII, and it's not the start of a 2-byte sequence (this was checked above)
-                        }
-                    }
-                }
-
-            // Check the 3-byte case.
-
-            BeforeProcessThreeByteSequence:
-
-                if (!Utf8Utility.IsFirstCharSurrogate(thisDWord))
-                {
-                    // Optimization: A three-byte character could indicate CJK text, which makes it likely
-                    // that the character following this one is also CJK. We'll perform the check now
-                    // rather than jumping to the beginning of the main loop.
-
-                    if (Utf8Utility.IsSecondCharAtLeastThreeUtf8Bytes(thisDWord))
-                    {
-                        if (!Utf8Utility.IsSecondCharSurrogate(thisDWord))
-                        {
-                            if (outputBytesRemaining < 6)
-                            {
-                                goto ConsumeSingleThreeByteRun; // not enough space - try consuming as much as we can
-                            }
-
-                            Utf8Utility.WriteTwoUtf16CharsAsTwoUtf8ThreeByteSequences(ref *pOutputBuffer, thisDWord);
-
-                            pInputBuffer += 2;
-                            pOutputBuffer += 6;
-                            outputBytesRemaining -= 6;
-
-                            // Try to remain in the 3-byte processing loop if at all possible.
-
-                            if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
-                            {
-                                goto ProcessNextCharAndFinish; // Running out of data - go down slow path
-                            }
-                            else
-                            {
-                                thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-
-                                if (Utf8Utility.IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
-                                {
-                                    goto BeforeProcessThreeByteSequence;
-                                }
-                                else
-                                {
-                                    // Fall back to standard processing loop since we don't know how to optimize this.
-                                    goto AfterReadDWord;
-                                }
-                            }
-                        }
-                    }
-
-                ConsumeSingleThreeByteRun:
-
-                    if (outputBytesRemaining < 3)
-                    {
-                        goto OutputBufferTooSmall;
-                    }
-
-                    Utf8Utility.WriteFirstUtf16CharAsUtf8ThreeByteSequence(ref *pOutputBuffer, thisDWord);
-
-                    pInputBuffer += 1;
-                    pOutputBuffer += 3;
-                    outputBytesRemaining -= 3;
-
-                    // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
-                    // in to the text. If this happens strip it off now before seeing if the next character
-                    // consists of three code units.
-
-                    if (Utf8Utility.IsSecondCharAscii(thisDWord))
-                    {
-                        if (0u >= (uint)outputBytesRemaining)
-                        {
-                            goto OutputBufferTooSmall;
-                        }
-
-                        if (BitConverter.IsLittleEndian)
-                        {
-                            *pOutputBuffer = (byte)(thisDWord >> 16);
-                        }
-                        else
-                        {
-                            *pOutputBuffer = (byte)(thisDWord);
-                        }
-
-                        pInputBuffer += 1;
-                        pOutputBuffer += 1;
-                        outputBytesRemaining -= 1;
-
-                        if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
-                        {
-                            goto ProcessNextCharAndFinish; // Running out of data - go down slow path
-                        }
-                        else
-                        {
-                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-
-                            if (Utf8Utility.IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
-                            {
-                                goto BeforeProcessThreeByteSequence;
-                            }
-                            else
-                            {
-                                // Fall back to standard processing loop since we don't know how to optimize this.
-                                goto AfterReadDWord;
-                            }
-                        }
-                    }
-
-                    if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
-                    {
-                        goto ProcessNextCharAndFinish; // Running out of data - go down slow path
-                    }
-                    else
-                    {
-                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-                        goto AfterReadDWordSkipAllCharsAsciiCheck; // we just checked above that this value isn't ASCII
-                    }
-                }
-
-                // Four byte sequence processing
-
-                if (Utf8Utility.IsWellFormedUtf16SurrogatePair(thisDWord))
-                {
-                    if (outputBytesRemaining < 4)
-                    {
-                        goto OutputBufferTooSmall;
-                    }
-
-                    Unsafe.WriteUnaligned<uint>(pOutputBuffer, Utf8Utility.ExtractFourUtf8BytesFromSurrogatePair(thisDWord));
-
-                    pInputBuffer += 2;
-                    pOutputBuffer += 4;
-                    outputBytesRemaining -= 4;
-
-                    continue; // go back to beginning of loop for processing
-                }
-
-                goto Error; // an ill-formed surrogate sequence: high not followed by low, or low not preceded by high
-            }
-
-        ProcessNextCharAndFinish:
-            inputLength = (int)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + CharsPerDWord;
-
-        ProcessInputOfLessThanDWordSize:
-            Debug.Assert(inputLength < CharsPerDWord);
-
-            if (0u >= (uint)inputLength)
-            {
-                goto InputBufferFullyConsumed;
-            }
-
-            uint thisChar = *pInputBuffer;
-            goto ProcessFinalChar;
-
-        ProcessOneCharFromCurrentDWordAndFinish:
-            if (BitConverter.IsLittleEndian)
-            {
-                thisChar = thisDWord & 0xFFFFu; // preserve only the first char
-            }
-            else
-            {
-                thisChar = thisDWord >> 16; // preserve only the first char
-            }
-
-        ProcessFinalChar:
-            {
-                if (thisChar <= 0x7Fu)
-                {
-                    if (0u >= (uint)outputBytesRemaining)
-                    {
-                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
-                    }
-
-                    // 1-byte (ASCII) case
-                    *pOutputBuffer = (byte)thisChar;
-
-                    pInputBuffer += 1;
-                    pOutputBuffer += 1;
-                }
-                else if (thisChar < 0x0800u)
-                {
-                    if (outputBytesRemaining < 2)
-                    {
-                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
-                    }
-
-                    // 2-byte case
-                    pOutputBuffer[1] = (byte)((thisChar & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10xxxxxx ]
-                    pOutputBuffer[0] = (byte)((thisChar >> 6) | unchecked((uint)(sbyte)0xC0)); // [ 110yyyyy ]
-
-                    pInputBuffer += 1;
-                    pOutputBuffer += 2;
-                }
-                else if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
-                {
-                    if (outputBytesRemaining < 3)
-                    {
-                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
-                    }
-
-                    // 3-byte case
-                    pOutputBuffer[2] = (byte)((thisChar & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10xxxxxx ]
-                    pOutputBuffer[1] = (byte)(((thisChar >> 6) & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10yyyyyy ]
-                    pOutputBuffer[0] = (byte)((thisChar >> 12) | unchecked((uint)(sbyte)0xE0)); // [ 1110zzzz ]
-
-                    pInputBuffer += 1;
-                    pOutputBuffer += 3;
-                }
-                else if (thisChar <= 0xDBFFu)
-                {
-                    // UTF-16 high surrogate code point with no trailing data, report incomplete input buffer
-                    goto InputBufferTooSmall;
-                }
-                else
-                {
-                    // UTF-16 low surrogate code point with no leading data, report error
-                    goto Error;
-                }
-            }
-
-            // There are two ways we can end up here. Either we were running low on input data,
-            // or we were running low on space in the destination buffer. If we're running low on
-            // input data (label targets ProcessInputOfLessThanDWordSize and ProcessNextCharAndFinish),
-            // then the inputLength value is guaranteed to be between 0 and 1, and we should return Done.
-            // If we're running low on destination buffer space (label target ProcessOneCharFromCurrentDWordAndFinish),
-            // then we didn't modify inputLength since entering the main loop, which means it should
-            // still have a value of >= 2. So checking the value of inputLength is all we need to do to determine
-            // which of the two scenarios we're in.
-
-            if (inputLength > 1)
-            {
-                goto OutputBufferTooSmall;
-            }
-
-        InputBufferFullyConsumed:
-            OperationStatus retVal = OperationStatus.Done;
-            goto ReturnCommon;
-
-        InputBufferTooSmall:
-            retVal = OperationStatus.NeedMoreData;
-            goto ReturnCommon;
-
-        OutputBufferTooSmall:
-            retVal = OperationStatus.DestinationTooSmall;
-            goto ReturnCommon;
-
-        Error:
-            retVal = OperationStatus.InvalidData;
-            goto ReturnCommon;
-
-        ReturnCommon:
-            pInputBufferRemaining = pInputBuffer;
-            pOutputBufferRemaining = pOutputBuffer;
-            return retVal;
-        }
-    }
-}
-#endif
diff --git a/src/DotNetty.Common/Internal/Utf8Utility32.Validation.cs b/src/DotNetty.Common/Internal/Utf8Utility32.Validation.cs
deleted file mode 100644
index 3592e8d9d..000000000
--- a/src/DotNetty.Common/Internal/Utf8Utility32.Validation.cs
+++ /dev/null
@@ -1,736 +0,0 @@
-﻿// borrowed from https://github.com/dotnet/corefx/tree/release/3.1/src/Common/src/CoreLib/System/Text/Unicode
-
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
-
-#if NETCOREAPP_3_0_GREATER
-using System;
-using System.Diagnostics;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.Intrinsics.X86;
-using nint = System.Int32;
-using nuint = System.UInt32;
-
-namespace DotNetty.Common.Internal
-{
-    internal static unsafe partial class Utf8Utility32
-    {
-#if DEBUG
-        private static void _ValidateAdditionalNIntDefinitions()
-        {
-            Debug.Assert(sizeof(nint) == IntPtr.Size && nint.MinValue < 0, "nint is defined incorrectly.");
-            Debug.Assert(sizeof(nuint) == IntPtr.Size && nuint.MinValue == 0, "nuint is defined incorrectly.");
-        }
-#endif // DEBUG
-
-        // Returns &inputBuffer[inputLength] if the input buffer is valid.
-        /// <summary>
-        /// Given an input buffer <paramref name="pInputBuffer"/> of byte length <paramref name="inputLength"/>,
-        /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
-        /// </summary>
-        /// <remarks>
-        /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
-        /// </remarks>
-        public static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
-        {
-            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
-            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
-
-            // First, try to drain off as many ASCII bytes as we can from the beginning.
-
-            {
-                nuint numAsciiBytesCounted = ASCIIUtility32.GetIndexOfFirstNonAsciiByte(pInputBuffer, (uint)inputLength);
-                pInputBuffer += numAsciiBytesCounted;
-
-                // Quick check - did we just end up consuming the entire input buffer?
-                // If so, short-circuit the remainder of the method.
-
-                inputLength -= (int)numAsciiBytesCounted;
-                if (0u >= (uint)inputLength)
-                {
-                    utf16CodeUnitCountAdjustment = 0;
-                    scalarCountAdjustment = 0;
-                    return pInputBuffer;
-                }
-            }
-
-#if DEBUG
-            // Keep these around for final validation at the end of the method.
-            byte* pOriginalInputBuffer = pInputBuffer;
-            int originalInputLength = inputLength;
-#endif
-
-            // Enregistered locals that we'll eventually out to our caller.
-
-            int tempUtf16CodeUnitCountAdjustment = 0;
-            int tempScalarCountAdjustment = 0;
-
-            if (inputLength < sizeof(uint))
-            {
-                goto ProcessInputOfLessThanDWordSize;
-            }
-
-            byte* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - sizeof(uint);
-
-            // Begin the main loop.
-
-#if DEBUG
-            byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
-#endif
-
-            while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
-            {
-                // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar.
-
-                uint thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-
-            AfterReadDWord:
-
-#if DEBUG
-                Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read.");
-                pLastBufferPosProcessed = pInputBuffer;
-#endif
-
-                // First, check for the common case of all-ASCII bytes.
-
-                if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
-                {
-                    // We read an all-ASCII sequence.
-
-                    pInputBuffer += sizeof(uint);
-
-                    // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
-                    // Below is basically unrolled loops with poor man's vectorization.
-
-                    // Below check is "can I read at least five DWORDs from the input stream?"
-                    // n.b. Since we incremented pInputBuffer above the below subtraction may result in a negative value,
-                    // hence using nint instead of nuint.
-
-                    if ((nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 4 * sizeof(uint))
-                    {
-                        // We want reads in the inner loop to be aligned. So let's perform a quick
-                        // ASCII check of the next 32 bits (4 bytes) now, and if that succeeds bump
-                        // the read pointer up to the next aligned address.
-
-                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-                        if (!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
-                        {
-                            goto AfterReadDWordSkipAllBytesAsciiCheck;
-                        }
-
-                        pInputBuffer = (byte*)((nuint)(pInputBuffer + 4) & ~(nuint)3);
-
-                        // At this point, the input buffer offset points to an aligned DWORD. We also know that there's
-                        // enough room to read at least four DWORDs from the buffer. (Heed the comment a few lines above:
-                        // the original 'if' check confirmed that there were 5 DWORDs before the alignment check, and
-                        // the alignment check consumes at most a single DWORD.)
-
-                        byte* pInputBufferFinalPosAtWhichCanSafelyLoop = pFinalPosWhereCanReadDWordFromInputBuffer - 3 * sizeof(uint); // can safely read 4 DWORDs here
-                        uint mask;
-
-                        do
-                        {
-                            if (Sse2.IsSupported && Bmi1.IsSupported)
-                            {
-                                // pInputBuffer is 32-bit aligned but not necessary 128-bit aligned, so we're
-                                // going to perform an unaligned load. We don't necessarily care about aligning
-                                // this because we pessimistically assume we'll encounter non-ASCII data at some
-                                // point in the not-too-distant future (otherwise we would've stayed entirely
-                                // within the all-ASCII vectorized code at the entry to this method).
-
-                                mask = (uint)Sse2.MoveMask(Sse2.LoadVector128((byte*)pInputBuffer));
-                                if (mask != 0)
-                                {
-                                    goto Sse2LoopTerminatedEarlyDueToNonAsciiData;
-                                }
-                            }
-                            else
-                            {
-                                if (!ASCIIUtility.AllBytesInUInt32AreAscii(((uint*)pInputBuffer)[0] | ((uint*)pInputBuffer)[1]))
-                                {
-                                    goto LoopTerminatedEarlyDueToNonAsciiDataInFirstPair;
-                                }
-
-                                if (!ASCIIUtility.AllBytesInUInt32AreAscii(((uint*)pInputBuffer)[2] | ((uint*)pInputBuffer)[3]))
-                                {
-                                    goto LoopTerminatedEarlyDueToNonAsciiDataInSecondPair;
-                                }
-                            }
-
-                            pInputBuffer += 4 * sizeof(uint); // consumed 4 DWORDs
-                        } while (pInputBuffer <= pInputBufferFinalPosAtWhichCanSafelyLoop);
-
-                        continue; // need to perform a bounds check because we might be running out of data
-
-                    Sse2LoopTerminatedEarlyDueToNonAsciiData:
-
-                        Debug.Assert(BitConverter.IsLittleEndian);
-                        Debug.Assert(Sse2.IsSupported);
-                        Debug.Assert(Bmi1.IsSupported);
-
-                        // The 'mask' value will have a 0 bit for each ASCII byte we saw and a 1 bit
-                        // for each non-ASCII byte we saw. We can count the number of ASCII bytes,
-                        // bump our input counter by that amount, and resume processing from the
-                        // "the first byte is no longer ASCII" portion of the main loop.
-
-                        Debug.Assert(mask != 0);
-
-                        pInputBuffer += Bmi1.TrailingZeroCount(mask);
-                        if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
-                        {
-                            goto ProcessRemainingBytesSlow;
-                        }
-
-                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer); // no longer guaranteed to be aligned
-                        goto BeforeProcessTwoByteSequence;
-
-                    LoopTerminatedEarlyDueToNonAsciiDataInSecondPair:
-
-                        pInputBuffer += 2 * sizeof(uint); // consumed 2 DWORDs
-
-                    LoopTerminatedEarlyDueToNonAsciiDataInFirstPair:
-
-                        // We know that there's *at least* two DWORDs of data remaining in the buffer.
-                        // We also know that one of them (or both of them) contains non-ASCII data somewhere.
-                        // Let's perform a quick check here to bypass the logic at the beginning of the main loop.
-
-                        thisDWord = *(uint*)pInputBuffer; // still aligned here
-                        if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
-                        {
-                            pInputBuffer += sizeof(uint); // consumed 1 more DWORD
-                            thisDWord = *(uint*)pInputBuffer; // still aligned here
-                        }
-
-                        goto AfterReadDWordSkipAllBytesAsciiCheck;
-                    }
-
-                    continue; // not enough data remaining to unroll loop - go back to beginning with bounds checks
-                }
-
-            AfterReadDWordSkipAllBytesAsciiCheck:
-
-                Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)); // this should have been handled earlier
-
-                // Next, try stripping off ASCII bytes one at a time.
-                // We only handle up to three ASCII bytes here since we handled the four ASCII byte case above.
-
-                {
-                    uint numLeadingAsciiBytes = ASCIIUtility.CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(thisDWord);
-                    pInputBuffer += numLeadingAsciiBytes;
-
-                    if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer)
-                    {
-                        goto ProcessRemainingBytesSlow; // Input buffer doesn't contain enough data to read a DWORD
-                    }
-                    else
-                    {
-                        // The input buffer at the current offset contains a non-ASCII byte.
-                        // Read an entire DWORD and fall through to multi-byte consumption logic.
-                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-                    }
-                }
-
-            BeforeProcessTwoByteSequence:
-
-                // At this point, we suspect we're working with a multi-byte code unit sequence,
-                // but we haven't yet validated it for well-formedness.
-
-                // The masks and comparands are derived from the Unicode Standard, Table 3-6.
-                // Additionally, we need to check for valid byte sequences per Table 3-7.
-
-                // Check the 2-byte case.
-
-                thisDWord -= (BitConverter.IsLittleEndian) ? 0x0000_80C0u : 0xC080_0000u;
-                if (0u >= (thisDWord & (BitConverter.IsLittleEndian ? 0x0000_C0E0u : 0xE0C0_0000u)))
-                {
-                    // Per Table 3-7, valid sequences are:
-                    // [ C2..DF ] [ 80..BF ]
-                    //
-                    // Due to our modification of 'thisDWord' above, this becomes:
-                    // [ 02..1F ] [ 00..3F ]
-                    //
-                    // We've already checked that the leading byte was originally in the range [ C0..DF ]
-                    // and that the trailing byte was originally in the range [ 80..BF ], so now we only need
-                    // to check that the modified leading byte is >= [ 02 ].
-
-                    if ((BitConverter.IsLittleEndian && (byte)thisDWord < 0x02u)
-                        || (!BitConverter.IsLittleEndian && thisDWord < 0x0200_0000u))
-                    {
-                        goto Error; // overlong form - leading byte was [ C0 ] or [ C1 ]
-                    }
-
-                ProcessTwoByteSequenceSkipOverlongFormCheck:
-
-                    // Optimization: If this is a two-byte-per-character language like Cyrillic or Hebrew,
-                    // there's a good chance that if we see one two-byte run then there's another two-byte
-                    // run immediately after. Let's check that now.
-
-                    // On little-endian platforms, we can check for the two-byte UTF8 mask *and* validate that
-                    // the value isn't overlong using a single comparison. On big-endian platforms, we'll need
-                    // to validate the mask and validate that the sequence isn't overlong as two separate comparisons.
-
-                    if ((BitConverter.IsLittleEndian && Utf8Utility.UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
-                        || (!BitConverter.IsLittleEndian && (Utf8Utility.UInt32EndsWithUtf8TwoByteMask(thisDWord) && !Utf8Utility.UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord))))
-                    {
-                        // We have two runs of two bytes each.
-                        pInputBuffer += 4;
-                        tempUtf16CodeUnitCountAdjustment -= 2; // 4 UTF-8 code units -> 2 UTF-16 code units (and 2 scalars)
-
-                        if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
-                        {
-                            // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
-                            // also two bytes. Check for that first before going back to the beginning of the loop.
-
-                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-
-                            if (BitConverter.IsLittleEndian)
-                            {
-                                if (Utf8Utility.UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
-                                {
-                                    // The next sequence is a valid two-byte sequence.
-                                    goto ProcessTwoByteSequenceSkipOverlongFormCheck;
-                                }
-                            }
-                            else
-                            {
-                                if (Utf8Utility.UInt32BeginsWithUtf8TwoByteMask(thisDWord))
-                                {
-                                    if (Utf8Utility.UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
-                                    {
-                                        goto Error; // The next sequence purports to be a 2-byte sequence but is overlong.
-                                    }
-
-                                    goto ProcessTwoByteSequenceSkipOverlongFormCheck;
-                                }
-                            }
-
-                            // If we reached this point, the next sequence is something other than a valid
-                            // two-byte sequence, so go back to the beginning of the loop.
-                            goto AfterReadDWord;
-                        }
-                        else
-                        {
-                            goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
-                        }
-                    }
-
-                    // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
-                    // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
-                    // bytes are ASCII?
-
-                    tempUtf16CodeUnitCountAdjustment--; // 2-byte sequence + (some number of ASCII bytes) -> 1 UTF-16 code units (and 1 scalar) [+ trailing]
-
-                    if (Utf8Utility.UInt32ThirdByteIsAscii(thisDWord))
-                    {
-                        if (Utf8Utility.UInt32FourthByteIsAscii(thisDWord))
-                        {
-                            pInputBuffer += 4;
-                        }
-                        else
-                        {
-                            pInputBuffer += 3;
-
-                            // A two-byte sequence followed by an ASCII byte followed by a non-ASCII byte.
-                            // Read in the next DWORD and jump directly to the start of the multi-byte processing block.
-
-                            if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
-                            {
-                                thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-                                goto BeforeProcessTwoByteSequence;
-                            }
-                        }
-                    }
-                    else
-                    {
-                        pInputBuffer += 2;
-                    }
-
-                    continue;
-                }
-
-                // Check the 3-byte case.
-                // We need to restore the C0 leading byte we stripped out earlier, then we can strip out the expected E0 byte.
-
-                thisDWord -= (BitConverter.IsLittleEndian) ? (0x0080_00E0u - 0x0000_00C0u) : (0xE000_8000u - 0xC000_0000u);
-                if (0u >= (thisDWord & (BitConverter.IsLittleEndian ? 0x00C0_C0F0u : 0xF0C0_C000u)))
-                {
-                ProcessThreeByteSequenceWithCheck:
-
-                    // We assume the caller has confirmed that the bit pattern is representative of a three-byte
-                    // sequence, but it may still be overlong or surrogate. We need to check for these possibilities.
-                    //
-                    // Per Table 3-7, valid sequences are:
-                    // [   E0   ] [ A0..BF ] [ 80..BF ]
-                    // [ E1..EC ] [ 80..BF ] [ 80..BF ]
-                    // [   ED   ] [ 80..9F ] [ 80..BF ]
-                    // [ EE..EF ] [ 80..BF ] [ 80..BF ]
-                    //
-                    // Big-endian examples of using the above validation table:
-                    // E0A0 = 1110 0000 1010 0000 => invalid (overlong ) patterns are 1110 0000 100# ####
-                    // ED9F = 1110 1101 1001 1111 => invalid (surrogate) patterns are 1110 1101 101# ####
-                    // If using the bitmask ......................................... 0000 1111 0010 0000 (=0F20),
-                    // Then invalid (overlong) patterns match the comparand ......... 0000 0000 0000 0000 (=0000),
-                    // And invalid (surrogate) patterns match the comparand ......... 0000 1101 0010 0000 (=0D20).
-                    //
-                    // It's ok if the caller has manipulated 'thisDWord' (e.g., by subtracting 0xE0 or 0x80)
-                    // as long as they haven't touched the bits we're about to use in our mask checking below.
-
-                    if (BitConverter.IsLittleEndian)
-                    {
-                        // The "overlong or surrogate" check can be implemented using a single jump, but there's
-                        // some overhead to moving the bits into the correct locations in order to perform the
-                        // correct comparison, and in practice the processor's branch prediction capability is
-                        // good enough that we shouldn't bother. So we'll use two jumps instead.
-
-                        // Can't extract this check into its own helper method because JITter produces suboptimal
-                        // assembly, even with aggressive inlining.
-
-                        // Code below becomes 5 instructions: test, jz, lea, test, jz
-
-                        if ((0u >= (thisDWord & 0x0000_200Fu)) || (0u >= ((thisDWord - 0x0000_200Du) & 0x0000_200Fu)))
-                        {
-                            goto Error; // overlong or surrogate
-                        }
-                    }
-                    else
-                    {
-                        if ((0u >= (thisDWord & 0x0F20_0000u)) || (0u >= ((thisDWord - 0x0D20_0000u) & 0x0F20_0000u)))
-                        {
-                            goto Error; // overlong or surrogate
-                        }
-                    }
-
-                ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks:
-
-                    // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
-                    // in to the text. If this happens strip it off now before seeing if the next character
-                    // consists of three code units.
-
-                    // Branchless: consume a 3-byte UTF-8 sequence and optionally an extra ASCII byte hanging off the end
-
-                    nint asciiAdjustment;
-                    if (BitConverter.IsLittleEndian)
-                    {
-                        asciiAdjustment = (int)thisDWord >> 31; // smear most significant bit across entire value
-                    }
-                    else
-                    {
-                        asciiAdjustment = (nint)(sbyte)thisDWord >> 7; // smear most significant bit of least significant byte across entire value
-                    }
-
-                    // asciiAdjustment = 0 if fourth byte is ASCII; -1 otherwise
-
-                    // Please *DO NOT* reorder the below two lines. It provides extra defense in depth in case this method
-                    // is ever changed such that pInputBuffer becomes a 'ref byte' instead of a simple 'byte*'. It's valid
-                    // to add 4 before backing up since we already checked previously that the input buffer contains at
-                    // least a DWORD's worth of data, so we're not going to run past the end of the buffer where the GC can
-                    // no longer track the reference. However, we can't back up before adding 4, since we might back up to
-                    // before the start of the buffer, and the GC isn't guaranteed to be able to track this.
-
-                    pInputBuffer += 4; // optimistically, assume consumed a 3-byte UTF-8 sequence plus an extra ASCII byte
-                    pInputBuffer += asciiAdjustment; // back up if we didn't actually consume an ASCII byte
-
-                    tempUtf16CodeUnitCountAdjustment -= 2; // 3 (or 4) UTF-8 bytes -> 1 (or 2) UTF-16 code unit (and 1 [or 2] scalar)
-
-                SuccessfullyProcessedThreeByteSequence:
-
-                    if (PlatformDependent.Is64BitProcess && BitConverter.IsLittleEndian)
-                    {
-                        // x64 little-endian optimization: A three-byte character could indicate CJK text,
-                        // which makes it likely that the character following this one is also CJK.
-                        // We'll try to process several three-byte sequences at a time.
-
-                        // The check below is really "can we read 9 bytes from the input buffer?" since 'pFinalPos...' is already offset
-                        // n.b. The subtraction below could result in a negative value (since we advanced pInputBuffer above), so
-                        // use nint instead of nuint.
-
-                        if ((nint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) >= 5)
-                        {
-                            ulong thisQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
-
-                            // Stage the next 32 bits into 'thisDWord' so that it's ready for us in case we need to jump backward
-                            // to a previous location in the loop. This offers defense against reading main memory again (which may
-                            // have been modified and could lead to a race condition).
-
-                            thisDWord = (uint)thisQWord;
-
-                            // Is this three 3-byte sequences in a row?
-                            // thisQWord = [ 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz ] [ 10xxxxxx ]
-                            //               ---- CHAR 3  ----   --------- CHAR 2 ---------   --------- CHAR 1 ---------     -CHAR 3-
-                            if ((thisQWord & 0xC0F0_C0C0_F0C0_C0F0ul) == 0x80E0_8080_E080_80E0ul && Utf8Utility.IsUtf8ContinuationByte(in pInputBuffer[8]))
-                            {
-                                // Saw a proper bitmask for three incoming 3-byte sequences, perform the
-                                // overlong and surrogate sequence checking now.
-
-                                // Check the first character.
-                                // If the first character is overlong or a surrogate, fail immediately.
-
-                                if ((0u >= ((uint)thisQWord & 0x200Fu)) || (0u >= (((uint)thisQWord - 0x200Du) & 0x200Fu)))
-                                {
-                                    goto Error;
-                                }
-
-                                // Check the second character.
-                                // At this point, we now know the first three bytes represent a well-formed sequence.
-                                // If there's an error beyond here, we'll jump back to the "process three known good bytes"
-                                // logic.
-
-                                thisQWord >>= 24;
-                                if ((0u >= ((uint)thisQWord & 0x200Fu)) || (0u >= (((uint)thisQWord - 0x200Du) & 0x200Fu)))
-                                {
-                                    goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks;
-                                }
-
-                                // Check the third character (we already checked that it's followed by a continuation byte).
-
-                                thisQWord >>= 24;
-                                if ((0u >= ((uint)thisQWord & 0x200Fu)) || (0u >= (((uint)thisQWord - 0x200Du) & 0x200Fu)))
-                                {
-                                    goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks;
-                                }
-
-                                pInputBuffer += 9;
-                                tempUtf16CodeUnitCountAdjustment -= 6; // 9 UTF-8 bytes -> 3 UTF-16 code units (and 3 scalars)
-
-                                goto SuccessfullyProcessedThreeByteSequence;
-                            }
-
-                            // Is this two 3-byte sequences in a row?
-                            // thisQWord = [ ######## ######## | 10xxxxxx 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz ]
-                            //                                   --------- CHAR 2 ---------   --------- CHAR 1 ---------
-                            if ((thisQWord & 0xC0C0_F0C0_C0F0ul) == 0x8080_E080_80E0ul)
-                            {
-                                // Saw a proper bitmask for two incoming 3-byte sequences, perform the
-                                // overlong and surrogate sequence checking now.
-
-                                // Check the first character.
-                                // If the first character is overlong or a surrogate, fail immediately.
-
-                                if ((0u >= ((uint)thisQWord & 0x200Fu)) || (0u >= (((uint)thisQWord - 0x200Du) & 0x200Fu)))
-                                {
-                                    goto Error;
-                                }
-
-                                // Check the second character.
-                                // At this point, we now know the first three bytes represent a well-formed sequence.
-                                // If there's an error beyond here, we'll jump back to the "process three known good bytes"
-                                // logic.
-
-                                thisQWord >>= 24;
-                                if ((0u >= ((uint)thisQWord & 0x200Fu)) || (0u >= (((uint)thisQWord - 0x200Du) & 0x200Fu)))
-                                {
-                                    goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks;
-                                }
-
-                                pInputBuffer += 6;
-                                tempUtf16CodeUnitCountAdjustment -= 4; // 6 UTF-8 bytes -> 2 UTF-16 code units (and 2 scalars)
-
-                                // The next byte in the sequence didn't have a 3-byte marker, so it's probably
-                                // an ASCII character. Jump back to the beginning of loop processing.
-
-                                continue;
-                            }
-
-                            if (Utf8Utility.UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
-                            {
-                                // A single three-byte sequence.
-                                goto ProcessThreeByteSequenceWithCheck;
-                            }
-                            else
-                            {
-                                // Not a three-byte sequence; perhaps ASCII?
-                                goto AfterReadDWord;
-                            }
-                        }
-                    }
-
-                    if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
-                    {
-                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
-
-                        // Optimization: A three-byte character could indicate CJK text, which makes it likely
-                        // that the character following this one is also CJK. We'll check for a three-byte sequence
-                        // marker now and jump directly to three-byte sequence processing if we see one, skipping
-                        // all of the logic at the beginning of the loop.
-
-                        if (Utf8Utility.UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
-                        {
-                            goto ProcessThreeByteSequenceWithCheck; // Found another [not yet validated] three-byte sequence; process
-                        }
-                        else
-                        {
-                            goto AfterReadDWord; // Probably ASCII punctuation or whitespace; go back to start of loop
-                        }
-                    }
-                    else
-                    {
-                        goto ProcessRemainingBytesSlow; // Running out of data
-                    }
-                }
-
-                // Assume the 4-byte case, but we need to validate.
-
-                if (BitConverter.IsLittleEndian)
-                {
-                    thisDWord &= 0xC0C0_FFFFu;
-
-                    // After the above modifications earlier in this method, we expect 'thisDWord'
-                    // to have the structure [ 10000000 00000000 00uuzzzz 00010uuu ]. We'll now
-                    // perform two checks to confirm this. The first will verify the
-                    // [ 10000000 00000000 00###### ######## ] structure by taking advantage of two's
-                    // complement representation to perform a single *signed* integer check.
-
-                    if ((int)thisDWord > unchecked((int)0x8000_3FFF))
-                    {
-                        goto Error; // didn't have three trailing bytes
-                    }
-
-                    // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding)
-                    // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding).
-
-                    thisDWord = BitOperations.RotateRight(thisDWord, 8);
-
-                    // Now, thisDWord = [ 00010uuu 10000000 00000000 00uuzzzz ].
-                    // The check is now a simple add / cmp / jcc combo.
-
-                    if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0x1080_0010u, 0x1480_000Fu))
-                    {
-                        goto Error; // overlong or out-of-range
-                    }
-                }
-                else
-                {
-                    thisDWord -= 0x80u;
-
-                    // After the above modifications earlier in this method, we expect 'thisDWord'
-                    // to have the structure [ 00010uuu 00uuzzzz 00yyyyyy 00xxxxxx ]. We'll now
-                    // perform two checks to confirm this. The first will verify the
-                    // [ ######## 00###### 00###### 00###### ] structure.
-
-                    if ((thisDWord & 0x00C0_C0C0u) != 0)
-                    {
-                        goto Error; // didn't have three trailing bytes
-                    }
-
-                    // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding)
-                    // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding).
-                    // This is a simple range check. (We don't care about the low two bytes.)
-
-                    if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0x1010_0000u, 0x140F_FFFFu))
-                    {
-                        goto Error; // overlong or out-of-range
-                    }
-                }
-
-                // Validation of 4-byte case complete.
-
-                pInputBuffer += 4;
-                tempUtf16CodeUnitCountAdjustment -= 2; // 4 UTF-8 bytes -> 2 UTF-16 code units
-                tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar
-
-                continue; // go back to beginning of loop for processing
-            }
-
-            goto ProcessRemainingBytesSlow;
-
-        ProcessInputOfLessThanDWordSize:
-
-            Debug.Assert(inputLength < 4);
-            nuint inputBufferRemainingBytes = (uint)inputLength;
-            goto ProcessSmallBufferCommon;
-
-        ProcessRemainingBytesSlow:
-
-            inputBufferRemainingBytes = (nuint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;
-
-        ProcessSmallBufferCommon:
-
-            Debug.Assert(inputBufferRemainingBytes < 4);
-            while (inputBufferRemainingBytes > 0)
-            {
-                uint firstByte = pInputBuffer[0];
-
-                if ((byte)firstByte < 0x80u)
-                {
-                    // 1-byte (ASCII) case
-                    pInputBuffer++;
-                    inputBufferRemainingBytes--;
-                    continue;
-                }
-                else if (inputBufferRemainingBytes >= 2)
-                {
-                    uint secondByte = pInputBuffer[1]; // typed as 32-bit since we perform arithmetic (not just comparisons) on this value
-                    if ((byte)firstByte < 0xE0u)
-                    {
-                        // 2-byte case
-                        if ((byte)firstByte >= 0xC2u && Utf8Utility.IsLowByteUtf8ContinuationByte(secondByte))
-                        {
-                            pInputBuffer += 2;
-                            tempUtf16CodeUnitCountAdjustment--; // 2 UTF-8 bytes -> 1 UTF-16 code unit (and 1 scalar)
-                            inputBufferRemainingBytes -= 2;
-                            continue;
-                        }
-                    }
-                    else if (inputBufferRemainingBytes >= 3)
-                    {
-                        if ((byte)firstByte < 0xF0u)
-                        {
-                            if ((byte)firstByte == 0xE0u)
-                            {
-                                if (!UnicodeUtility.IsInRangeInclusive(secondByte, 0xA0u, 0xBFu))
-                                {
-                                    goto Error; // overlong encoding
-                                }
-                            }
-                            else if ((byte)firstByte == 0xEDu)
-                            {
-                                if (!UnicodeUtility.IsInRangeInclusive(secondByte, 0x80u, 0x9Fu))
-                                {
-                                    goto Error; // would be a UTF-16 surrogate code point
-                                }
-                            }
-                            else
-                            {
-                                if (!Utf8Utility.IsLowByteUtf8ContinuationByte(secondByte))
-                                {
-                                    goto Error; // first trailing byte doesn't have proper continuation marker
-                                }
-                            }
-
-                            if (Utf8Utility.IsUtf8ContinuationByte(in pInputBuffer[2]))
-                            {
-                                pInputBuffer += 3;
-                                tempUtf16CodeUnitCountAdjustment -= 2; // 3 UTF-8 bytes -> 2 UTF-16 code units (and 2 scalars)
-                                inputBufferRemainingBytes -= 3;
-                                continue;
-                            }
-                        }
-                    }
-                }
-
-                // Error - no match.
-
-                goto Error;
-            }
-
-            // If we reached this point, we're out of data, and we saw no bad UTF8 sequence.
-
-#if DEBUG
-            // Quick check that for the success case we're going to fulfill our contract of returning &inputBuffer[inputLength].
-            Debug.Assert(pOriginalInputBuffer + originalInputLength == pInputBuffer, "About to return an unexpected value.");
-#endif
-
-        Error:
-
-            // Report back to our caller how far we got before seeing invalid data.
-            // (Also used for normal termination when falling out of the loop above.)
-
-            utf16CodeUnitCountAdjustment = tempUtf16CodeUnitCountAdjustment;
-            scalarCountAdjustment = tempScalarCountAdjustment;
-            return pInputBuffer;
-        }
-    }
-}
-#endif
diff --git a/src/DotNetty.Common/Utilities/AsciiString.NetCore3.cs b/src/DotNetty.Common/Utilities/AsciiString.NetCore3.cs
index 9f522caf7..0b72ab992 100644
--- a/src/DotNetty.Common/Utilities/AsciiString.NetCore3.cs
+++ b/src/DotNetty.Common/Utilities/AsciiString.NetCore3.cs
@@ -72,9 +72,7 @@ private static unsafe bool TryGetBytesFast(char* pChars, int charCount, byte* pB
         [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetBytesCommon
         private static unsafe int GetBytesFast(char* pChars, int charsLength, byte* pBytes, int bytesLength, out int charsConsumed)
         {
-            int bytesWritten = PlatformDependent.Is64BitProcess
-                ? (int)ASCIIUtility64.NarrowUtf16ToAscii(pChars, pBytes, (uint)Math.Min(charsLength, bytesLength))
-                : (int)ASCIIUtility32.NarrowUtf16ToAscii(pChars, pBytes, (uint)Math.Min(charsLength, bytesLength));
+            int bytesWritten = (int)ASCIIUtility.NarrowUtf16ToAscii(pChars, pBytes, (uint)Math.Min(charsLength, bytesLength));
 
             charsConsumed = bytesWritten;
             return bytesWritten;

From f6948235ce31c237c571d3138c66a4466892f9ee Mon Sep 17 00:00:00 2001
From: cuteant <cuteant@outlook.com>
Date: Thu, 24 Jun 2021 00:52:24 +0800
Subject: [PATCH 2/5] Align with dotnet/runtime/CoreLib

---
 src/DotNetty.Buffers/ByteBufferUtil.Utf8.cs   |    2 +-
 src/DotNetty.Buffers/DotNetty.Buffers.csproj  |    2 +-
 .../Writer/ByteBufferWriter.Binary.Helper.cs  |   42 +-
 .../Internal/ASCIIUtility.Helpers.cs          |   17 +
 .../Internal/ASCIIUtility.Net.cs              | 1307 ++++++++++++++
 .../Internal/ASCIIUtility.NetCore3.cs         | 1092 ++++++++++++
 src/DotNetty.Common/Internal/ASCIIUtility.cs  | 1256 ++------------
 .../Internal/TextEncodings.Utf16.NetCore3.cs  |   92 +
 .../Internal/TextEncodings.Utf8.NetCore3.cs   |   99 ++
 src/DotNetty.Common/Internal/TextEncodings.cs |    6 +-
 src/DotNetty.Common/Internal/UnicodeDebug.cs  |   25 +-
 .../Internal/UnicodeUtility.cs                |   10 +-
 .../Internal/Utf16Utility.Validation.Net.cs   |  508 ++++++
 ...cs => Utf16Utility.Validation.NetCore3.cs} |    6 +-
 src/DotNetty.Common/Internal/Utf16Utility.cs  |   55 +
 .../Internal/Utf8Utility.Helpers.cs           |   86 +-
 .../Internal/Utf8Utility.Transcoding.Net.cs   | 1510 +++++++++++++++++
 ...cs => Utf8Utility.Transcoding.NetCore3.cs} |  108 +-
 .../Internal/Utf8Utility.Validation.Net.cs    |   32 +
 .../Internal/Utf8Utility.Validation.cs        |   83 +-
 .../Internal/Utf8Utility.WhiteSpace.cs        |  132 ++
 src/DotNetty.Common/Internal/Utf8Utility.cs   |   59 -
 .../DotNetty.Common.Tests.csproj              |    5 +-
 .../Internal/CoreLib/ASCIIUtilityTests.cs     |  419 +++++
 .../CoreLib/BoundedMemory.Creation.cs         |   95 ++
 .../Internal/CoreLib/BoundedMemory.Unix.cs    |   50 +
 .../Internal/CoreLib/BoundedMemory.Windows.cs |  335 ++++
 .../Internal/CoreLib/BoundedMemory.cs         |   53 +
 .../Internal/CoreLib/PoisonPagePlacement.cs   |   28 +
 .../Utf16UtilityTests.ValidateChars.cs        |  267 +++
 .../Internal/CoreLib/Utf8Tests.cs             |  799 +++++++++
 .../CoreLib/Utf8UtilityTests.ValidateBytes.cs |  396 +++++
 32 files changed, 7621 insertions(+), 1355 deletions(-)
 create mode 100644 src/DotNetty.Common/Internal/ASCIIUtility.Net.cs
 create mode 100644 src/DotNetty.Common/Internal/ASCIIUtility.NetCore3.cs
 create mode 100644 src/DotNetty.Common/Internal/Utf16Utility.Validation.Net.cs
 rename src/DotNetty.Common/Internal/{Utf16Utility.Validation.cs => Utf16Utility.Validation.NetCore3.cs} (99%)
 create mode 100644 src/DotNetty.Common/Internal/Utf8Utility.Transcoding.Net.cs
 rename src/DotNetty.Common/Internal/{Utf8Utility.Transcoding.cs => Utf8Utility.Transcoding.NetCore3.cs} (93%)
 create mode 100644 src/DotNetty.Common/Internal/Utf8Utility.Validation.Net.cs
 create mode 100644 src/DotNetty.Common/Internal/Utf8Utility.WhiteSpace.cs
 create mode 100644 test/DotNetty.Common.Tests/Internal/CoreLib/ASCIIUtilityTests.cs
 create mode 100644 test/DotNetty.Common.Tests/Internal/CoreLib/BoundedMemory.Creation.cs
 create mode 100644 test/DotNetty.Common.Tests/Internal/CoreLib/BoundedMemory.Unix.cs
 create mode 100644 test/DotNetty.Common.Tests/Internal/CoreLib/BoundedMemory.Windows.cs
 create mode 100644 test/DotNetty.Common.Tests/Internal/CoreLib/BoundedMemory.cs
 create mode 100644 test/DotNetty.Common.Tests/Internal/CoreLib/PoisonPagePlacement.cs
 create mode 100644 test/DotNetty.Common.Tests/Internal/CoreLib/Utf16UtilityTests.ValidateChars.cs
 create mode 100644 test/DotNetty.Common.Tests/Internal/CoreLib/Utf8Tests.cs
 create mode 100644 test/DotNetty.Common.Tests/Internal/CoreLib/Utf8UtilityTests.ValidateBytes.cs

diff --git a/src/DotNetty.Buffers/ByteBufferUtil.Utf8.cs b/src/DotNetty.Buffers/ByteBufferUtil.Utf8.cs
index 5e49b34a9..dcecd65da 100644
--- a/src/DotNetty.Buffers/ByteBufferUtil.Utf8.cs
+++ b/src/DotNetty.Buffers/ByteBufferUtil.Utf8.cs
@@ -414,7 +414,7 @@ static bool IsUtf8(IByteBuffer buf, int index, int length)
             var utf8Span = buf.GetReadableSpan(index, length);
             ref byte utf8Source = ref MemoryMarshal.GetReference(utf8Span);
 
-            IntPtr offset = (IntPtr)0; // Use IntPtr for arithmetic to avoid unnecessary 64->32->64 truncations
+            nint offset = 0; // Use IntPtr for arithmetic to avoid unnecessary 64->32->64 truncations
             uint uLength = (uint)length;
 
             while ((uint)index < uLength)
diff --git a/src/DotNetty.Buffers/DotNetty.Buffers.csproj b/src/DotNetty.Buffers/DotNetty.Buffers.csproj
index 043649d35..c1262b48e 100644
--- a/src/DotNetty.Buffers/DotNetty.Buffers.csproj
+++ b/src/DotNetty.Buffers/DotNetty.Buffers.csproj
@@ -2,7 +2,7 @@
   <Import Project="..\nuget.props" />
   
   <PropertyGroup>
-    <TargetFrameworks>netcoreapp2.1;netstandard2.1;$(StandardTfms)</TargetFrameworks>
+    <TargetFrameworks>net5.0;netcoreapp2.1;netstandard2.1;$(StandardTfms)</TargetFrameworks>
     <RootNamespace>DotNetty.Buffers</RootNamespace>
     <AssemblyName>SpanNetty.Buffers</AssemblyName>
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
diff --git a/src/DotNetty.Buffers/Writer/ByteBufferWriter.Binary.Helper.cs b/src/DotNetty.Buffers/Writer/ByteBufferWriter.Binary.Helper.cs
index efae526ab..7bff2dabd 100644
--- a/src/DotNetty.Buffers/Writer/ByteBufferWriter.Binary.Helper.cs
+++ b/src/DotNetty.Buffers/Writer/ByteBufferWriter.Binary.Helper.cs
@@ -42,7 +42,7 @@ private unsafe static void SetMedium(ref byte start, int value)
             //    UnsafeByteBufferUtil.SetMedium(bytes, value);
             //}
             uint unsignedValue = (uint)value;
-            IntPtr offset = (IntPtr)0;
+            nint offset = 0;
             Unsafe.AddByteOffset(ref start, offset) = (byte)(unsignedValue >> 16);
             Unsafe.AddByteOffset(ref start, offset + 1) = (byte)(unsignedValue >> 8);
             Unsafe.AddByteOffset(ref start, offset + 2) = (byte)unsignedValue;
@@ -52,7 +52,7 @@ private unsafe static void SetMedium(ref byte start, int value)
         private unsafe static void SetMediumLE(ref byte start, int value)
         {
             uint unsignedValue = (uint)value;
-            IntPtr offset = (IntPtr)0;
+            nint offset = 0;
             Unsafe.AddByteOffset(ref start, offset) = (byte)unsignedValue;
             Unsafe.AddByteOffset(ref start, offset + 1) = (byte)(unsignedValue >> 8);
             Unsafe.AddByteOffset(ref start, offset + 2) = (byte)(unsignedValue >> 16);
@@ -66,8 +66,8 @@ private unsafe static void SetDecimal(ref byte start, decimal value)
             uint mid = (uint)bits[1];
             uint high = (uint)bits[2];
             uint flags = (uint)bits[3];
-            IntPtr offset = (IntPtr)0;
 
+            nint offset = 0;
             Unsafe.AddByteOffset(ref start, offset) = (byte)(lo >> 24); // lo
             Unsafe.AddByteOffset(ref start, offset + 1) = (byte)(lo >> 16);
             Unsafe.AddByteOffset(ref start, offset + 2) = (byte)(lo >> 8);
@@ -76,14 +76,15 @@ private unsafe static void SetDecimal(ref byte start, decimal value)
             Unsafe.AddByteOffset(ref start, offset + 5) = (byte)(mid >> 16);
             Unsafe.AddByteOffset(ref start, offset + 6) = (byte)(mid >> 8);
             Unsafe.AddByteOffset(ref start, offset + 7) = (byte)mid;
-            Unsafe.AddByteOffset(ref start, offset + 8) = (byte)(high >> 24); // high
-            Unsafe.AddByteOffset(ref start, offset + 9) = (byte)(high >> 16);
-            Unsafe.AddByteOffset(ref start, offset + 10) = (byte)(high >> 8);
-            Unsafe.AddByteOffset(ref start, offset + 11) = (byte)high;
-            Unsafe.AddByteOffset(ref start, offset + 12) = (byte)(flags >> 24); // flags
-            Unsafe.AddByteOffset(ref start, offset + 13) = (byte)(flags >> 16);
-            Unsafe.AddByteOffset(ref start, offset + 14) = (byte)(flags >> 8);
-            Unsafe.AddByteOffset(ref start, offset + 15) = (byte)flags;
+            offset += 8;
+            Unsafe.AddByteOffset(ref start, offset) = (byte)(high >> 24); // high
+            Unsafe.AddByteOffset(ref start, offset + 1) = (byte)(high >> 16);
+            Unsafe.AddByteOffset(ref start, offset + 2) = (byte)(high >> 8);
+            Unsafe.AddByteOffset(ref start, offset + 3) = (byte)high;
+            Unsafe.AddByteOffset(ref start, offset + 4) = (byte)(flags >> 24); // flags
+            Unsafe.AddByteOffset(ref start, offset + 5) = (byte)(flags >> 16);
+            Unsafe.AddByteOffset(ref start, offset + 6) = (byte)(flags >> 8);
+            Unsafe.AddByteOffset(ref start, offset + 7) = (byte)flags;
         }
 
         [MethodImpl(InlineMethod.AggressiveInlining)]
@@ -94,8 +95,8 @@ private unsafe static void SetDecimalLE(ref byte start, decimal value)
             uint mid = (uint)bits[1];
             uint high = (uint)bits[2];
             uint flags = (uint)bits[3];
-            IntPtr offset = (IntPtr)0;
 
+            nint offset = 0;
             Unsafe.AddByteOffset(ref start, offset) = (byte)lo;
             Unsafe.AddByteOffset(ref start, offset + 1) = (byte)(lo >> 8);
             Unsafe.AddByteOffset(ref start, offset + 2) = (byte)(lo >> 16);
@@ -104,14 +105,15 @@ private unsafe static void SetDecimalLE(ref byte start, decimal value)
             Unsafe.AddByteOffset(ref start, offset + 5) = (byte)(mid >> 8);
             Unsafe.AddByteOffset(ref start, offset + 6) = (byte)(mid >> 16);
             Unsafe.AddByteOffset(ref start, offset + 7) = (byte)(mid >> 24); // mid
-            Unsafe.AddByteOffset(ref start, offset + 8) = (byte)high;
-            Unsafe.AddByteOffset(ref start, offset + 9) = (byte)(high >> 8);
-            Unsafe.AddByteOffset(ref start, offset + 10) = (byte)(high >> 16);
-            Unsafe.AddByteOffset(ref start, offset + 11) = (byte)(high >> 24); // high
-            Unsafe.AddByteOffset(ref start, offset + 12) = (byte)flags;
-            Unsafe.AddByteOffset(ref start, offset + 13) = (byte)(flags >> 8);
-            Unsafe.AddByteOffset(ref start, offset + 14) = (byte)(flags >> 16);
-            Unsafe.AddByteOffset(ref start, offset + 15) = (byte)(flags >> 24); // flags
+            offset += 8;
+            Unsafe.AddByteOffset(ref start, offset) = (byte)high;
+            Unsafe.AddByteOffset(ref start, offset + 1) = (byte)(high >> 8);
+            Unsafe.AddByteOffset(ref start, offset + 2) = (byte)(high >> 16);
+            Unsafe.AddByteOffset(ref start, offset + 3) = (byte)(high >> 24); // high
+            Unsafe.AddByteOffset(ref start, offset + 4) = (byte)flags;
+            Unsafe.AddByteOffset(ref start, offset + 5) = (byte)(flags >> 8);
+            Unsafe.AddByteOffset(ref start, offset + 6) = (byte)(flags >> 16);
+            Unsafe.AddByteOffset(ref start, offset + 7) = (byte)(flags >> 24); // flags
         }
 
         /// <summary>Writes a 32-bit integer in a compressed format.</summary>
diff --git a/src/DotNetty.Common/Internal/ASCIIUtility.Helpers.cs b/src/DotNetty.Common/Internal/ASCIIUtility.Helpers.cs
index 189d4a4d8..f67534b52 100644
--- a/src/DotNetty.Common/Internal/ASCIIUtility.Helpers.cs
+++ b/src/DotNetty.Common/Internal/ASCIIUtility.Helpers.cs
@@ -9,7 +9,9 @@
 using System.Diagnostics;
 using System.Numerics;
 using System.Runtime.CompilerServices;
+#if NETCOREAPP3_1
 using System.Runtime.Intrinsics.X86;
+#endif
 
 namespace DotNetty.Common.Internal
 {
@@ -48,6 +50,7 @@ internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiDat
         {
             Debug.Assert(!AllBytesInUInt32AreAscii(value), "Caller shouldn't provide an all-ASCII value.");
 
+#if NETCOREAPP3_1
             // Use BMI1 directly rather than going through BitOperations. We only see a perf gain here
             // if we're able to emit a real tzcnt instruction; the software fallback used by BitOperations
             // is too slow for our purposes since we can provide our own faster, specialized software fallback.
@@ -84,8 +87,22 @@ internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiDat
 
                 return numAsciiBytes;
             }
+#else
+            if (BitConverter.IsLittleEndian)
+            {
+                return (uint)BitOperations.TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3;
+            }
+#endif
             else
             {
+#if NET
+                // Couldn't use tzcnt, use specialized software fallback.
+                // The 'allBytesUpToNowAreAscii' DWORD uses bit twiddling to hold a 1 or a 0 depending
+                // on whether all processed bytes were ASCII. Then we accumulate all of the
+                // results to calculate how many consecutive ASCII bytes are present.
+
+                value = ~value;
+#endif
                 // BinaryPrimitives.ReverseEndianness is only implemented as an intrinsic on
                 // little-endian platforms, so using it in this big-endian path would be too
                 // expensive. Instead we'll just change how we perform the shifts.
diff --git a/src/DotNetty.Common/Internal/ASCIIUtility.Net.cs b/src/DotNetty.Common/Internal/ASCIIUtility.Net.cs
new file mode 100644
index 000000000..7c73bd34a
--- /dev/null
+++ b/src/DotNetty.Common/Internal/ASCIIUtility.Net.cs
@@ -0,0 +1,1307 @@
+﻿// borrowed from https://github.com/dotnet/corefx/blob/release/3.1/src/Common/src/CoreLib/System/Text/ASCIIUtility.cs
+
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#if NET
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.X86;
+
+namespace DotNetty.Common.Internal
+{
+    partial class ASCIIUtility
+    {
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static int GetIndexOfFirstNonAsciiByteInLane_AdvSimd(Vector128<byte> value, Vector128<byte> bitmask)
+        {
+            if (!AdvSimd.Arm64.IsSupported || !BitConverter.IsLittleEndian)
+            {
+                throw new PlatformNotSupportedException();
+            }
+
+            // extractedBits[i] = (value[i] >> 7) & bitmask[i]; with the caller's 0x1001 bitmask that is 1 << (4 * (i % 2)) per byte.
+            Vector128<byte> mostSignificantBitIsSet = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte();
+            Vector128<byte> extractedBits = AdvSimd.And(mostSignificantBitIsSet, bitmask);
+
+            // collapse mask to lower bits: the pairwise add folds two adjacent byte flags into one byte, so the low 64 bits cover all 16 lanes
+            extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
+            ulong mask = extractedBits.AsUInt64().ToScalar();
+
+            // calculate the index: each input byte owns 4 bits of 'mask' (hence >> 2); an all-ASCII vector gives mask == 0 and index >= 16
+            int index = BitOperations.TrailingZeroCount(mask) >> 2;
+            Debug.Assert((mask != 0) ? index < 16 : index >= 16);
+            return index;
+        }
+
+        /// <summary>
+        /// Returns the index in <paramref name="pBuffer"/> where the first non-ASCII byte is found.
+        /// Returns <paramref name="bufferLength"/> if the buffer is empty or all-ASCII.
+        /// </summary>
+        /// <remarks>An ASCII byte is defined as 0x00 - 0x7F, inclusive.</remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bufferLength)
+        {
+            // If SSE2 (or AdvSimd on little-endian Arm64) is supported, use those specific intrinsics instead of
+            // GetIndexOfFirstNonAsciiByte_Default. This has two benefits: (a) we can take advantage of specific
+            // instructions like pmovmskb which we know are optimized, and (b) we can avoid downclocking the
+            // processor while this method is running. Note '&&' binds tighter than '||' in the check below.
+
+            return (Sse2.IsSupported || AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)
+                ? GetIndexOfFirstNonAsciiByte_Intrinsified(pBuffer, bufferLength)
+                : GetIndexOfFirstNonAsciiByte_Default(pBuffer, bufferLength);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool ContainsNonAsciiByte_Sse2(uint sseMask)
+        {
+            Debug.Assert(sseMask != uint.MaxValue); // uint.MaxValue is the caller's "not yet computed" initial value
+            Debug.Assert(Sse2.IsSupported);
+            return sseMask != 0; // the mask comes from Sse2.MoveMask at the call sites: any set bit marks a byte with its high bit set
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool ContainsNonAsciiByte_AdvSimd(uint advSimdIndex)
+        {
+            Debug.Assert(advSimdIndex != uint.MaxValue); // uint.MaxValue is the caller's "not yet computed" initial value
+            Debug.Assert(AdvSimd.IsSupported); // NOTE(review): callers gate on AdvSimd.Arm64.IsSupported; consider asserting that instead
+            return advSimdIndex < 16; // GetIndexOfFirstNonAsciiByteInLane_AdvSimd yields < 16 for a hit and >= 16 for an all-ASCII vector
+        }
+
+        private static unsafe nuint GetIndexOfFirstNonAsciiByte_Intrinsified(byte* pBuffer, nuint bufferLength)
+        {
+            // JIT turns the below into constants
+
+            uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
+            nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);
+
+            Debug.Assert(Sse2.IsSupported || AdvSimd.Arm64.IsSupported, "Sse2 or AdvSimd64 required.");
+            Debug.Assert(BitConverter.IsLittleEndian, "This SSE2/Arm64 implementation assumes little-endian.");
+
+            Vector128<byte> bitmask = BitConverter.IsLittleEndian ?
+                Vector128.Create((ushort)0x1001).AsByte() :
+                Vector128.Create((ushort)0x0110).AsByte();
+
+            uint currentSseMask = uint.MaxValue, secondSseMask = uint.MaxValue; // uint.MaxValue == "not yet computed"
+            uint currentAdvSimdIndex = uint.MaxValue, secondAdvSimdIndex = uint.MaxValue;
+            byte* pOriginalBuffer = pBuffer;
+
+            // This method is written such that control generally flows top-to-bottom, avoiding
+            // jumps as much as possible in the optimistic case of a large enough buffer and
+            // "all ASCII". If we see non-ASCII data, we jump out of the hot paths to targets
+            // after all the main logic.
+
+            if (bufferLength < SizeOfVector128)
+            {
+                goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead
+            }
+
+            // Read the first vector unaligned.
+
+            if (Sse2.IsSupported)
+            {
+                currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
+                if (ContainsNonAsciiByte_Sse2(currentSseMask))
+                {
+                    goto FoundNonAsciiDataInCurrentChunk;
+                }
+            }
+            else if (AdvSimd.Arm64.IsSupported)
+            {
+                currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask); // unaligned load
+                if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex))
+                {
+                    goto FoundNonAsciiDataInCurrentChunk;
+                }
+            }
+            else
+            {
+                throw new PlatformNotSupportedException();
+            }
+
+            // If we have less than 32 bytes to process, just go straight to the final unaligned
+            // read. There's no need to mess with the loop logic in the middle of this method.
+
+            if (bufferLength < 2 * SizeOfVector128)
+            {
+                goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
+            }
+
+            // Now adjust the read pointer so that future reads are aligned (rounds up to the next vector boundary; checked bytes may be re-read).
+
+            pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector128) & ~(nuint)MaskOfAllBitsInVector128);
+
+#if DEBUG
+            long numBytesRead = pBuffer - pOriginalBuffer;
+            Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector128, "We should've made forward progress of at least one byte.");
+            Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
+#endif
+
+            // Adjust the remaining length to account for what we just read.
+
+            bufferLength += (nuint)pOriginalBuffer;
+            bufferLength -= (nuint)pBuffer;
+
+            // The buffer is now properly aligned.
+            // Read 2 vectors at a time if possible.
+
+            if (bufferLength >= 2 * SizeOfVector128)
+            {
+                byte* pFinalVectorReadPos = (byte*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128);
+
+                // After this point, we no longer need to update the bufferLength value.
+
+                do
+                {
+                    if (Sse2.IsSupported)
+                    {
+                        Vector128<byte> firstVector = Sse2.LoadAlignedVector128(pBuffer);
+                        Vector128<byte> secondVector = Sse2.LoadAlignedVector128(pBuffer + SizeOfVector128);
+
+                        currentSseMask = (uint)Sse2.MoveMask(firstVector);
+                        secondSseMask = (uint)Sse2.MoveMask(secondVector);
+                        if (ContainsNonAsciiByte_Sse2(currentSseMask | secondSseMask))
+                        {
+                            goto FoundNonAsciiDataInInnerLoop;
+                        }
+                    }
+                    else if (AdvSimd.Arm64.IsSupported)
+                    {
+                        Vector128<byte> firstVector = AdvSimd.LoadVector128(pBuffer);
+                        Vector128<byte> secondVector = AdvSimd.LoadVector128(pBuffer + SizeOfVector128);
+
+                        currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(firstVector, bitmask);
+                        secondAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(secondVector, bitmask);
+                        if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex) || ContainsNonAsciiByte_AdvSimd(secondAdvSimdIndex))
+                        {
+                            goto FoundNonAsciiDataInInnerLoop;
+                        }
+                    }
+                    else
+                    {
+                        throw new PlatformNotSupportedException();
+                    }
+
+                    pBuffer += 2 * SizeOfVector128;
+                } while (pBuffer <= pFinalVectorReadPos);
+            }
+
+            // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from.
+            // Since the above loop doesn't update bufferLength, we can't rely on its absolute value.
+            // But we _can_ rely on it to tell us how much remaining data must be drained by looking
+            // at what bits of it are set. This works because had we updated it within the loop above,
+            // we would've been subtracting 2 * SizeOfVector128 on each iteration, but we only care about
+            // bits which are less significant than those that the subtraction would've acted on.
+
+            // If there is fewer than one vector length remaining, skip the next aligned read.
+
+            if ((bufferLength & SizeOfVector128) == 0)
+            {
+                goto DoFinalUnalignedVectorRead;
+            }
+
+            // At least one full vector's worth of data remains, so we can safely read it.
+            // Remember, at this point pBuffer is still aligned.
+
+            if (Sse2.IsSupported)
+            {
+                currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadAlignedVector128(pBuffer));
+                if (ContainsNonAsciiByte_Sse2(currentSseMask))
+                {
+                    goto FoundNonAsciiDataInCurrentChunk;
+                }
+            }
+            else if (AdvSimd.Arm64.IsSupported)
+            {
+                currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask);
+                if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex))
+                {
+                    goto FoundNonAsciiDataInCurrentChunk;
+                }
+            }
+            else
+            {
+                throw new PlatformNotSupportedException();
+            }
+
+        IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
+
+            pBuffer += SizeOfVector128;
+
+        DoFinalUnalignedVectorRead:
+
+            if (((byte)bufferLength & MaskOfAllBitsInVector128) != 0)
+            {
+                // Perform an unaligned read of the last vector.
+                // We need to adjust the pointer because we're re-reading data.
+
+                pBuffer += (bufferLength & MaskOfAllBitsInVector128) - SizeOfVector128;
+
+                if (Sse2.IsSupported)
+                {
+                    currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
+                    if (ContainsNonAsciiByte_Sse2(currentSseMask))
+                    {
+                        goto FoundNonAsciiDataInCurrentChunk;
+                    }
+
+                }
+                else if (AdvSimd.Arm64.IsSupported)
+                {
+                    currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask); // unaligned load
+                    if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex))
+                    {
+                        goto FoundNonAsciiDataInCurrentChunk;
+                    }
+
+                }
+                else
+                {
+                    throw new PlatformNotSupportedException();
+                }
+
+                pBuffer += SizeOfVector128;
+            }
+
+        Finish:
+            return (nuint)pBuffer - (nuint)pOriginalBuffer; // and we're done!
+
+        FoundNonAsciiDataInInnerLoop:
+
+            // If the current (first) mask isn't the mask that contains non-ASCII data, then it must
+            // instead be the second mask. If so, skip the entire first mask and drain ASCII bytes
+            // from the second mask.
+
+            if (Sse2.IsSupported)
+            {
+                if (!ContainsNonAsciiByte_Sse2(currentSseMask))
+                {
+                    pBuffer += SizeOfVector128;
+                    currentSseMask = secondSseMask;
+                }
+            }
+            else if (AdvSimd.IsSupported) // NOTE(review): every other branch in this method tests AdvSimd.Arm64.IsSupported; confirm this is intended
+            {
+                if (!ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex))
+                {
+                    pBuffer += SizeOfVector128;
+                    currentAdvSimdIndex = secondAdvSimdIndex;
+                }
+            }
+            else
+            {
+                throw new PlatformNotSupportedException();
+            }
+        FoundNonAsciiDataInCurrentChunk:
+
+
+            if (Sse2.IsSupported)
+            {
+                // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
+                // Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't
+                // available, we'll fall back to a normal loop.
+                Debug.Assert(ContainsNonAsciiByte_Sse2(currentSseMask), "Shouldn't be here unless we see non-ASCII data.");
+                pBuffer += (uint)BitOperations.TrailingZeroCount(currentSseMask);
+            }
+            else if (AdvSimd.Arm64.IsSupported)
+            {
+                Debug.Assert(ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex), "Shouldn't be here unless we see non-ASCII data.");
+                pBuffer += currentAdvSimdIndex;
+            }
+            else
+            {
+                throw new PlatformNotSupportedException();
+            }
+
+            goto Finish;
+
+        FoundNonAsciiDataInCurrentDWord:
+
+            uint currentDWord; // declared here; each goto to this label (in the drain code below) assigns it first
+            Debug.Assert(!AllBytesInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");
+            pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentDWord);
+
+            goto Finish;
+
+        InputBufferLessThanOneVectorInLength:
+
+            // These code paths get hit if the original input length was less than one vector in size.
+            // We can't perform vectorized reads at this point, so we'll fall back to reading primitives
+            // directly. Note that all of these reads are unaligned.
+
+            Debug.Assert(bufferLength < SizeOfVector128);
+
+            // QWORD drain
+
+            if ((bufferLength & 8) != 0)
+            {
+                if (UIntPtr.Size == sizeof(ulong))
+                {
+                    // If we can use 64-bit tzcnt to count the number of leading ASCII bytes, prefer it.
+
+                    ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
+                    if (!AllBytesInUInt64AreAscii(candidateUInt64))
+                    {
+                        // Clear everything but the high bit of each byte, then tzcnt.
+                        // Remember to divide by 8 at the end to convert bit count to byte count.
+
+                        candidateUInt64 &= UInt64HighBitsOnlyMask;
+                        pBuffer += (nuint)(BitOperations.TrailingZeroCount(candidateUInt64) >> 3);
+                        goto Finish;
+                    }
+                }
+                else
+                {
+                    // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead.
+
+                    currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
+                    uint nextDWord = Unsafe.ReadUnaligned<uint>(pBuffer + 4);
+
+                    if (!AllBytesInUInt32AreAscii(currentDWord | nextDWord))
+                    {
+                        // At least one of the values wasn't all-ASCII.
+                        // We need to figure out which one it was and stick it in the currentDWord local.
+
+                        if (AllBytesInUInt32AreAscii(currentDWord))
+                        {
+                            currentDWord = nextDWord; // this one is the culprit
+                            pBuffer += 4;
+                        }
+
+                        goto FoundNonAsciiDataInCurrentDWord;
+                    }
+                }
+
+                pBuffer += 8; // successfully consumed 8 ASCII bytes
+            }
+
+            // DWORD drain
+
+            if ((bufferLength & 4) != 0)
+            {
+                currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
+
+                if (!AllBytesInUInt32AreAscii(currentDWord))
+                {
+                    goto FoundNonAsciiDataInCurrentDWord;
+                }
+
+                pBuffer += 4; // successfully consumed 4 ASCII bytes
+            }
+
+            // WORD drain
+            // (We movzx to a DWORD for ease of manipulation.)
+
+            if ((bufferLength & 2) != 0)
+            {
+                currentDWord = Unsafe.ReadUnaligned<ushort>(pBuffer);
+
+                if (!AllBytesInUInt32AreAscii(currentDWord))
+                {
+                    // We only care about the 0x0080 bit of the value. If it's not set, then we
+                    // increment pBuffer by 1. If it's set, we don't increment it at all.
+
+                    pBuffer += (nuint)((nint)(sbyte)currentDWord >> 7) + 1;
+                    goto Finish;
+                }
+
+                pBuffer += 2; // successfully consumed 2 ASCII bytes
+            }
+
+            // BYTE drain
+
+            if ((bufferLength & 1) != 0)
+            {
+                // sbyte has non-negative value if byte is ASCII.
+
+                if (*(sbyte*)(pBuffer) >= 0)
+                {
+                    pBuffer++; // successfully consumed a single byte
+                }
+            }
+
+            goto Finish;
+        }
+
+        private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuint bufferLength /* in chars */)
+        {
+            // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
+            // will be elided by JIT once we determine which specific ISAs we support.
+
+            // Quick check for empty inputs.
+
+            if (bufferLength == 0)
+            {
+                return 0;
+            }
+
+            // JIT turns the below into constants
+
+            uint SizeOfVector128InBytes = (uint)Unsafe.SizeOf<Vector128<byte>>();
+            uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char);
+
+            Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
+            Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");
+
+            Vector128<ushort> firstVector, secondVector;
+            uint currentMask;
+            char* pOriginalBuffer = pBuffer;
+
+            if (bufferLength < SizeOfVector128InChars)
+            {
+                goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead
+            }
+
+            // This method is written such that control generally flows top-to-bottom, avoiding
+            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
+            // data, we jump out of the hot paths to targets at the end of the method.
+
+            Vector128<ushort> asciiMaskForTestZ = Vector128.Create((ushort)0xFF80); // used for PTEST on supported hardware
+            Vector128<ushort> asciiMaskForAddSaturate = Vector128.Create((ushort)0x7F80); // used for PADDUSW
+            const uint NonAsciiDataSeenMask = 0b_1010_1010_1010_1010; // used for determining whether 'currentMask' contains non-ASCII data
+
+//#if SYSTEM_PRIVATE_CORELIB
+            Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));
+//#endif
+
+            // Read the first vector unaligned.
+
+            firstVector = Sse2.LoadVector128((ushort*)pBuffer); // unaligned load
+
+            // The operation below forces the 0x8000 bit of each WORD to be set iff the WORD element
+            // has value >= 0x0080 (non-ASCII). Then we'll treat the vector as a BYTE vector in order
+            // to extract the mask. Reminder: the 0x0080 bit of each WORD should be ignored.
+
+            currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
+
+            if ((currentMask & NonAsciiDataSeenMask) != 0)
+            {
+                goto FoundNonAsciiDataInCurrentMask;
+            }
+
+            // If we have less than 32 bytes to process, just go straight to the final unaligned
+            // read. There's no need to mess with the loop logic in the middle of this method.
+
+            // Adjust the remaining length to account for what we just read.
+            // For the remainder of this code path, bufferLength will be in bytes, not chars.
+
+            bufferLength <<= 1; // chars to bytes
+
+            if (bufferLength < 2 * SizeOfVector128InBytes)
+            {
+                goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
+            }
+
+            // Now adjust the read pointer so that future reads are aligned.
+
+            pBuffer = (char*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1));
+
+#if DEBUG
+            long numCharsRead = pBuffer - pOriginalBuffer;
+            Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector128InChars, "We should've made forward progress of at least one char.");
+            Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
+#endif
+
+            // Adjust remaining buffer length.
+
+            bufferLength += (nuint)pOriginalBuffer;
+            bufferLength -= (nuint)pBuffer;
+
+            // The buffer is now properly aligned.
+            // Read 2 vectors at a time if possible.
+
+            if (bufferLength >= 2 * SizeOfVector128InBytes)
+            {
+                char* pFinalVectorReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes);
+
+                // After this point, we no longer need to update the bufferLength value.
+
+                do
+                {
+                    firstVector = Sse2.LoadAlignedVector128((ushort*)pBuffer);
+                    secondVector = Sse2.LoadAlignedVector128((ushort*)pBuffer + SizeOfVector128InChars);
+                    Vector128<ushort> combinedVector = Sse2.Or(firstVector, secondVector);
+
+                    if (Sse41.IsSupported)
+                    {
+                        // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
+                        // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
+                        if (!Sse41.TestZ(combinedVector, asciiMaskForTestZ))
+                        {
+                            goto FoundNonAsciiDataInFirstOrSecondVector;
+                        }
+                    }
+                    else
+                    {
+                        // See comment earlier in the method for an explanation of how the below logic works.
+                        currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(combinedVector, asciiMaskForAddSaturate).AsByte());
+                        if ((currentMask & NonAsciiDataSeenMask) != 0)
+                        {
+                            goto FoundNonAsciiDataInFirstOrSecondVector;
+                        }
+                    }
+
+                    pBuffer += 2 * SizeOfVector128InChars;
+                } while (pBuffer <= pFinalVectorReadPos);
+            }
+
+            // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from.
+            // Since the above loop doesn't update bufferLength, we can't rely on its absolute value.
+            // But we _can_ rely on it to tell us how much remaining data must be drained by looking
+            // at what bits of it are set. This works because had we updated it within the loop above,
+            // we would've been subtracting 2 * SizeOfVector128InBytes on each iteration, but we only care about
+            // bits which are less significant than those that the subtraction would've acted on.
+
+            // If there is fewer than one vector length remaining, skip the next aligned read.
+            // Remember, at this point bufferLength is measured in bytes, not chars.
+
+            if ((bufferLength & SizeOfVector128InBytes) == 0)
+            {
+                goto DoFinalUnalignedVectorRead;
+            }
+
+            // At least one full vector's worth of data remains, so we can safely read it.
+            // Remember, at this point pBuffer is still aligned.
+
+            firstVector = Sse2.LoadAlignedVector128((ushort*)pBuffer);
+
+            if (Sse41.IsSupported)
+            {
+                // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
+                // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
+                if (!Sse41.TestZ(firstVector, asciiMaskForTestZ))
+                {
+                    goto FoundNonAsciiDataInFirstVector;
+                }
+            }
+            else
+            {
+                // See comment earlier in the method for an explanation of how the below logic works.
+                currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
+                if ((currentMask & NonAsciiDataSeenMask) != 0)
+                {
+                    goto FoundNonAsciiDataInCurrentMask;
+                }
+            }
+
+        IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
+
+            pBuffer += SizeOfVector128InChars;
+
+        DoFinalUnalignedVectorRead:
+
+            if (((byte)bufferLength & (SizeOfVector128InBytes - 1)) != 0)
+            {
+                // Perform an unaligned read of the last vector.
+                // We need to adjust the pointer because we're re-reading data.
+
+                pBuffer = (char*)((byte*)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes);
+                firstVector = Sse2.LoadVector128((ushort*)pBuffer); // unaligned load
+
+                if (Sse41.IsSupported)
+                {
+                    // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
+                    // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
+                    if (!Sse41.TestZ(firstVector, asciiMaskForTestZ))
+                    {
+                        goto FoundNonAsciiDataInFirstVector;
+                    }
+                }
+                else
+                {
+                    // See comment earlier in the method for an explanation of how the below logic works.
+                    currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
+                    if ((currentMask & NonAsciiDataSeenMask) != 0)
+                    {
+                        goto FoundNonAsciiDataInCurrentMask;
+                    }
+                }
+
+                pBuffer += SizeOfVector128InChars;
+            }
+
+        Finish:
+
+            Debug.Assert(((nuint)pBuffer - (nuint)pOriginalBuffer) % 2 == 0, "Shouldn't have incremented any pointer by an odd byte count.");
+            return ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char); // and we're done! (remember to adjust for char count)
+
+        FoundNonAsciiDataInFirstOrSecondVector:
+
+            // We don't know if the first or the second vector contains non-ASCII data. Check the first
+            // vector, and if that's all-ASCII then the second vector must be the culprit. Either way
+            // we'll make sure the first vector local is the one that contains the non-ASCII data.
+
+            // See comment earlier in the method for an explanation of how the below logic works.
+            if (Sse41.IsSupported)
+            {
+                if (!Sse41.TestZ(firstVector, asciiMaskForTestZ))
+                {
+                    goto FoundNonAsciiDataInFirstVector;
+                }
+            }
+            else
+            {
+                currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
+                if ((currentMask & NonAsciiDataSeenMask) != 0)
+                {
+                    goto FoundNonAsciiDataInCurrentMask;
+                }
+            }
+
+            // Wasn't the first vector; must be the second.
+
+            pBuffer += SizeOfVector128InChars;
+            firstVector = secondVector;
+
+        FoundNonAsciiDataInFirstVector:
+
+            // See comment earlier in the method for an explanation of how the below logic works.
+            currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
+
+        FoundNonAsciiDataInCurrentMask:
+
+            // See comment earlier in the method accounting for the 0x8000 and 0x0080 bits set after the WORD-sized operations.
+
+            currentMask &= NonAsciiDataSeenMask;
+
+            // Now, the mask contains - from the LSB - a 0b00 pair for each ASCII char we saw, and a 0b10 pair for each non-ASCII char.
+            //
+            // (Keep endianness in mind in the below examples.)
+            // A non-ASCII char followed by two ASCII chars is 0b..._00_00_10. (tzcnt = 1)
+            // An ASCII char followed by two non-ASCII chars is 0b..._10_10_00. (tzcnt = 3)
+            // Two ASCII chars followed by a non-ASCII char is 0b..._10_00_00. (tzcnt = 5)
+            //
+            // This means tzcnt = 2 * numLeadingAsciiChars + 1. We can conveniently take advantage of the fact
+            // that the 2x multiplier already matches the char* stride length, then just subtract 1 at the end to
+            // compute the correct final ending pointer value.
+
+            Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
+            pBuffer = (char*)((byte*)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask) - 1);
+
+            goto Finish;
+
+        FoundNonAsciiDataInCurrentDWord:
+
+            uint currentDWord;
+            Debug.Assert(!AllCharsInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");
+
+            if (FirstCharInUInt32IsAscii(currentDWord))
+            {
+                pBuffer++; // skip past the ASCII char
+            }
+
+            goto Finish;
+
+        InputBufferLessThanOneVectorInLength:
+
+            // These code paths get hit if the original input length was less than one vector in size.
+            // We can't perform vectorized reads at this point, so we'll fall back to reading primitives
+            // directly. Note that all of these reads are unaligned.
+
+            // Reminder: If this code path is hit, bufferLength is still a char count, not a byte count.
+            // We skipped the code path that multiplied the count by sizeof(char).
+
+            Debug.Assert(bufferLength < SizeOfVector128InChars);
+
+            // QWORD drain
+
+            if ((bufferLength & 4) != 0)
+            {
+                if (UIntPtr.Size == sizeof(ulong))
+                {
+                    // If we can use 64-bit tzcnt to count the number of leading ASCII chars, prefer it.
+
+                    ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
+                    if (!AllCharsInUInt64AreAscii(candidateUInt64))
+                    {
+                        // Clear the low 7 bits (the ASCII bits) of each char, then tzcnt.
+                        // Remember to divide by 8 at the end to convert bit count to byte count,
+                        // then the & ~1 at the end to treat a match in the high byte of
+                        // any char the same as a match in the low byte of that same char.
+
+                        candidateUInt64 &= 0xFF80FF80_FF80FF80ul;
+                        pBuffer = (char*)((byte*)pBuffer + ((nuint)(BitOperations.TrailingZeroCount(candidateUInt64) >> 3) & ~(nuint)1));
+                        goto Finish;
+                    }
+                }
+                else
+                {
+                    // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead.
+
+                    currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
+                    uint nextDWord = Unsafe.ReadUnaligned<uint>(pBuffer + 4 / sizeof(char));
+
+                    if (!AllCharsInUInt32AreAscii(currentDWord | nextDWord))
+                    {
+                        // At least one of the values wasn't all-ASCII.
+                        // We need to figure out which one it was and stick it in the currentMask local.
+
+                        if (AllCharsInUInt32AreAscii(currentDWord))
+                        {
+                            currentDWord = nextDWord; // this one is the culprit
+                            pBuffer += 4 / sizeof(char);
+                        }
+
+                        goto FoundNonAsciiDataInCurrentDWord;
+                    }
+                }
+
+                pBuffer += 4; // successfully consumed 4 ASCII chars
+            }
+
+            // DWORD drain
+
+            if ((bufferLength & 2) != 0)
+            {
+                currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
+
+                if (!AllCharsInUInt32AreAscii(currentDWord))
+                {
+                    goto FoundNonAsciiDataInCurrentDWord;
+                }
+
+                pBuffer += 2; // successfully consumed 2 ASCII chars
+            }
+
+            // WORD drain
+            // This is the final drain; there's no need for a BYTE drain since our elemental type is 16-bit char.
+
+            if ((bufferLength & 1) != 0)
+            {
+                if (*pBuffer <= 0x007F)
+                {
+                    pBuffer++; // successfully consumed a single char
+                }
+            }
+
+            goto Finish;
+        }
+
+        private static unsafe nuint NarrowUtf16ToAscii_Sse2(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
+        {
+            // Narrows UTF-16 chars to ASCII bytes with SSE2 (using SSE4.1 PTEST when available) and returns the element
+            // offset at which narrowing stopped. Much of the logic here will be elided by JIT once the supported ISAs are known.
+
+            // JIT turns the below into constants
+
+            uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
+            nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);
+
+            // This method is written such that control generally flows top-to-bottom, avoiding
+            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
+            // data, we jump out of the hot paths to targets at the end of the method.
+
+            Debug.Assert(Sse2.IsSupported);
+            Debug.Assert(BitConverter.IsLittleEndian);
+            Debug.Assert(elementCount >= 2 * SizeOfVector128);
+
+            Vector128<short> asciiMaskForTestZ = Vector128.Create(unchecked((short)0xFF80)); // used for PTEST on supported hardware
+            Vector128<ushort> asciiMaskForAddSaturate = Vector128.Create((ushort)0x7F80); // used for PADDUSW
+            const int NonAsciiDataSeenMask = 0b_1010_1010_1010_1010; // used for determining whether the pmovmskb operation saw non-ASCII chars
+
+            // First, perform an unaligned read of the first part of the input buffer.
+
+            Vector128<short> utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer); // unaligned load
+
+            // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do.
+            // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
+
+            if (Sse41.IsSupported)
+            {
+                if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForTestZ))
+                {
+                    return 0;
+                }
+            }
+            else
+            {
+                if ((Sse2.MoveMask(Sse2.AddSaturate(utf16VectorFirst.AsUInt16(), asciiMaskForAddSaturate).AsByte()) & NonAsciiDataSeenMask) != 0)
+                {
+                    return 0;
+                }
+            }
+
+            // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
+
+            Vector128<byte> asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
+            Sse2.StoreScalar((ulong*)pAsciiBuffer, asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
+
+            nuint currentOffsetInElements = SizeOfVector128 / 2; // we processed 8 elements so far
+
+            // We're going to get the best performance when we have aligned writes, so we'll take the
+            // hit of potentially unaligned reads in order to hit this sweet spot.
+
+            // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote
+            // the 8 bytes previously. If the 0x08 bit is set at the pinned address, then the 8 bytes we wrote
+            // previously mean that the 0x08 bit is *not* set at address &pAsciiBuffer[SizeOfVector128 / 2]. In
+            // that case we can immediately back up to the previous aligned boundary and start the main loop.
+            // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at
+            // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump
+            // just past the next aligned boundary address.
+
+            if (((uint)pAsciiBuffer & (SizeOfVector128 / 2)) == 0)
+            {
+                // We need to perform one more partial vector write before we can get the alignment we want.
+
+                utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements); // unaligned load
+
+                // See comments earlier in this method for information about how this works.
+                if (Sse41.IsSupported)
+                {
+                    if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForTestZ))
+                    {
+                        goto Finish;
+                    }
+                }
+                else
+                {
+                    if ((Sse2.MoveMask(Sse2.AddSaturate(utf16VectorFirst.AsUInt16(), asciiMaskForAddSaturate).AsByte()) & NonAsciiDataSeenMask) != 0)
+                    {
+                        goto Finish;
+                    }
+                }
+
+                // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
+                asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
+                Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
+            }
+
+            // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
+            // point, then use that as the base offset going forward.
+
+            currentOffsetInElements = SizeOfVector128 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector128);
+            Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector128, "We wrote at least 1 byte but no more than a whole vector.");
+
+            Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer.");
+            Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector128, "We should be able to run at least one whole vector.");
+
+            nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128; // each iteration consumes SizeOfVector128 elements (two 8-char reads)
+            do
+            {
+                // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector.
+
+                utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements); // unaligned load
+                Vector128<short> utf16VectorSecond = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements + SizeOfVector128 / sizeof(short)); // unaligned load
+                Vector128<short> combinedVector = Sse2.Or(utf16VectorFirst, utf16VectorSecond);
+
+                // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
+                if (Sse41.IsSupported)
+                {
+                    if (!Sse41.TestZ(combinedVector, asciiMaskForTestZ))
+                    {
+                        goto FoundNonAsciiDataInLoop;
+                    }
+                }
+                else
+                {
+                    if ((Sse2.MoveMask(Sse2.AddSaturate(combinedVector.AsUInt16(), asciiMaskForAddSaturate).AsByte()) & NonAsciiDataSeenMask) != 0)
+                    {
+                        goto FoundNonAsciiDataInLoop;
+                    }
+                }
+
+                // Build up the ASCII vector and perform the store.
+
+                asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorSecond);
+
+                Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector128 == 0, "Write should be aligned.");
+                Sse2.StoreAligned(pAsciiBuffer + currentOffsetInElements, asciiVector); // aligned
+
+                currentOffsetInElements += SizeOfVector128;
+            } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop);
+
+        Finish:
+
+            // There might be some ASCII data left over. That's fine - we'll let our caller handle the final drain.
+            return currentOffsetInElements;
+
+        FoundNonAsciiDataInLoop:
+
+            // Can we at least narrow the first vector (the lower-addressed 8 chars of the pair we just read)?
+            // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
+            if (Sse41.IsSupported)
+            {
+                if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForTestZ))
+                {
+                    goto Finish; // found non-ASCII data
+                }
+            }
+            else
+            {
+                if ((Sse2.MoveMask(Sse2.AddSaturate(utf16VectorFirst.AsUInt16(), asciiMaskForAddSaturate).AsByte()) & NonAsciiDataSeenMask) != 0)
+                {
+                    goto Finish; // found non-ASCII data
+                }
+            }
+
+            // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
+            asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
+
+            Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
+
+            Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is aligned
+            currentOffsetInElements += SizeOfVector128 / 2;
+
+            goto Finish;
+        }
+
+        /// <summary>
+        /// Copies as many ASCII bytes (00..7F) as possible from <paramref name="pAsciiBuffer"/>
+        /// to <paramref name="pUtf16Buffer"/>, stopping when the first non-ASCII byte is encountered
+        /// or once <paramref name="elementCount"/> elements have been converted. Returns the total number
+        /// of elements that were able to be converted.
+        /// </summary>
+        public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
+        {
+            // Intrinsified in mono interpreter
+            nuint currentOffset = 0;
+
+            // If SSE2 or AdvSimd (Arm64) is supported on a little-endian target, use those specific intrinsics
+            // instead of the generic vectorized code below. This has two benefits: (a) we can take advantage of
+            // specific instructions like pmovmskb which we know are optimized, and (b) we can avoid downclocking
+            // the processor while this method is running.
+
+            if (BitConverter.IsLittleEndian && (Sse2.IsSupported || AdvSimd.Arm64.IsSupported))
+            {
+                if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>())
+                {
+                    currentOffset = WidenAsciiToUtf16_Intrinsified(pAsciiBuffer, pUtf16Buffer, elementCount);
+                }
+            }
+            else if (Vector.IsHardwareAccelerated)
+            {
+                uint SizeOfVector = (uint)Unsafe.SizeOf<Vector<byte>>(); // JIT will make this a const
+
+                // Only bother vectorizing if we have enough data to do so.
+                if (elementCount >= SizeOfVector)
+                {
+                    // Note use of SBYTE instead of BYTE below; we're using the two's-complement
+                    // representation of negative integers to act as a surrogate for "is ASCII?".
+
+                    nuint finalOffsetWhereCanLoop = elementCount - SizeOfVector;
+                    do
+                    {
+                        Vector<sbyte> asciiVector = Unsafe.ReadUnaligned<Vector<sbyte>>(pAsciiBuffer + currentOffset);
+                        if (Vector.LessThanAny(asciiVector, Vector<sbyte>.Zero))
+                        {
+                            break; // found non-ASCII data
+                        }
+
+                        Vector.Widen(Vector.AsVectorByte(asciiVector), out Vector<ushort> utf16LowVector, out Vector<ushort> utf16HighVector);
+
+                        // TODO: Is the below logic also valid for big-endian platforms?
+                        Unsafe.WriteUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset, utf16LowVector);
+                        Unsafe.WriteUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset + Vector<ushort>.Count, utf16HighVector);
+
+                        currentOffset += SizeOfVector;
+                    } while (currentOffset <= finalOffsetWhereCanLoop);
+                }
+            }
+
+            Debug.Assert(currentOffset <= elementCount);
+            nuint remainingElementCount = elementCount - currentOffset;
+
+            // Try to widen 32 bits -> 64 bits at a time.
+            // We needn't update remainingElementCount after this point.
+
+            uint asciiData;
+
+            if (remainingElementCount >= 4)
+            {
+                nuint finalOffsetWhereCanLoop = currentOffset + remainingElementCount - 4;
+                do
+                {
+                    asciiData = Unsafe.ReadUnaligned<uint>(pAsciiBuffer + currentOffset);
+                    if (!AllBytesInUInt32AreAscii(asciiData))
+                    {
+                        goto FoundNonAsciiData;
+                    }
+
+                    WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pUtf16Buffer[currentOffset], asciiData);
+                    currentOffset += 4;
+                } while (currentOffset <= finalOffsetWhereCanLoop);
+            }
+
+            // Try to widen 16 bits -> 32 bits.
+
+            if (((uint)remainingElementCount & 2) != 0)
+            {
+                asciiData = Unsafe.ReadUnaligned<ushort>(pAsciiBuffer + currentOffset);
+                if (!AllBytesInUInt32AreAscii(asciiData))
+                {
+                    goto FoundNonAsciiData;
+                }
+
+                if (BitConverter.IsLittleEndian)
+                {
+                    pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
+                    pUtf16Buffer[currentOffset + 1] = (char)(asciiData >> 8);
+                }
+                else
+                {
+                    pUtf16Buffer[currentOffset + 1] = (char)(byte)asciiData;
+                    pUtf16Buffer[currentOffset] = (char)(asciiData >> 8);
+                }
+
+                currentOffset += 2;
+            }
+
+            // Try to widen 8 bits -> 16 bits.
+
+            if (((uint)remainingElementCount & 1) != 0)
+            {
+                asciiData = pAsciiBuffer[currentOffset];
+                if (((byte)asciiData & 0x80) != 0)
+                {
+                    goto Finish;
+                }
+
+                pUtf16Buffer[currentOffset] = (char)asciiData;
+                currentOffset++;
+            }
+
+        Finish:
+
+            return currentOffset;
+
+        FoundNonAsciiData:
+
+            Debug.Assert(!AllBytesInUInt32AreAscii(asciiData), "Shouldn't have reached this point if we have an all-ASCII input.");
+
+            // Drain ASCII bytes one at a time, starting from the least significant byte of asciiData.
+            // NOTE(review): that is memory order only on little-endian targets - confirm behavior on big-endian.
+            while (((byte)asciiData & 0x80) == 0)
+            {
+                pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
+                currentOffset++;
+                asciiData >>= 8;
+            }
+
+            goto Finish;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool ContainsNonAsciiByte(Vector128<byte> value)
+        {
+            // Arm64 only: a pairwise max folds all 16 bytes into the low 8 while keeping any set high bit.
+            if (AdvSimd.Arm64.IsSupported)
+            {
+                return (AdvSimd.Arm64.MaxPairwise(value, value).AsUInt64().ToScalar() & 0x8080808080808080) != 0;
+            }
+            throw new PlatformNotSupportedException();
+        }
+
+        private static unsafe nuint WidenAsciiToUtf16_Intrinsified(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
+        {
+            // Widens ASCII bytes to UTF-16 chars with SSE2 or AdvSimd (Arm64) and returns the offset at which widening stopped.
+            // JIT turns the below into constants
+            uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
+            nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);
+
+            // This method is written such that control generally flows top-to-bottom, avoiding
+            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
+            // data, we jump out of the hot paths to targets at the end of the method.
+
+            Debug.Assert(Sse2.IsSupported || AdvSimd.Arm64.IsSupported);
+            Debug.Assert(BitConverter.IsLittleEndian);
+            Debug.Assert(elementCount >= 2 * SizeOfVector128);
+
+            // We're going to get the best performance when we have aligned writes, so we'll take the
+            // hit of potentially unaligned reads in order to hit this sweet spot.
+
+            Vector128<byte> asciiVector;
+            Vector128<byte> utf16FirstHalfVector;
+            bool containsNonAsciiBytes;
+
+            // First, perform an unaligned read of the first part of the input buffer.
+
+            if (Sse2.IsSupported)
+            {
+                asciiVector = Sse2.LoadVector128(pAsciiBuffer); // unaligned load
+                containsNonAsciiBytes = (uint)Sse2.MoveMask(asciiVector) != 0;
+            }
+            else if (AdvSimd.Arm64.IsSupported)
+            {
+                asciiVector = AdvSimd.LoadVector128(pAsciiBuffer);
+                containsNonAsciiBytes = ContainsNonAsciiByte(asciiVector);
+            }
+            else
+            {
+                throw new PlatformNotSupportedException();
+            }
+
+            // If there's non-ASCII data anywhere in the first vector (all 16 bytes are tested), there's nothing we can do.
+
+            if (containsNonAsciiBytes)
+            {
+                return 0;
+            }
+
+            // Then widen the low half of that vector and perform an unaligned write to the start of the output buffer.
+
+            Vector128<byte> zeroVector = Vector128<byte>.Zero;
+
+            if (Sse2.IsSupported)
+            {
+                utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector);
+                Sse2.Store((byte*)pUtf16Buffer, utf16FirstHalfVector); // unaligned
+            }
+            else if (AdvSimd.Arm64.IsSupported)
+            {
+                utf16FirstHalfVector = AdvSimd.ZeroExtendWideningLower(asciiVector.GetLower()).AsByte();
+                AdvSimd.Store((byte*)pUtf16Buffer, utf16FirstHalfVector); // unaligned
+            }
+            else
+            {
+                throw new PlatformNotSupportedException();
+            }
+
+            // Calculate how many elements we wrote in order to get pOutputBuffer to its next alignment
+            // point, then use that as the base offset going forward. Remember the >> 1 to account for
+            // that we wrote chars, not bytes. This means we may re-read data in the next iteration of
+            // the loop, but this is ok.
+
+            nuint currentOffset = (SizeOfVector128 >> 1) - (((nuint)pUtf16Buffer >> 1) & (MaskOfAllBitsInVector128 >> 1));
+            Debug.Assert(0 < currentOffset && currentOffset <= SizeOfVector128 / sizeof(char));
+
+            nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128;
+
+            // Calculating the destination address outside the loop results in significant
+            // perf wins vs. relying on the JIT to fold memory addressing logic into the
+            // write instructions. See: https://github.com/dotnet/runtime/issues/33002
+
+            char* pCurrentWriteAddress = pUtf16Buffer + currentOffset;
+
+            do
+            {
+                // In a loop, perform an unaligned read, widen to two vectors, then aligned write the two vectors.
+
+                if (Sse2.IsSupported)
+                {
+                    asciiVector = Sse2.LoadVector128(pAsciiBuffer + currentOffset); // unaligned load
+                    containsNonAsciiBytes = (uint)Sse2.MoveMask(asciiVector) != 0;
+                }
+                else if (AdvSimd.Arm64.IsSupported)
+                {
+                    asciiVector = AdvSimd.LoadVector128(pAsciiBuffer + currentOffset);
+                    containsNonAsciiBytes = ContainsNonAsciiByte(asciiVector);
+                }
+                else
+                {
+                    throw new PlatformNotSupportedException();
+                }
+
+                if (containsNonAsciiBytes)
+                {
+                    // non-ASCII byte somewhere
+                    goto NonAsciiDataSeenInInnerLoop;
+                }
+
+                if (Sse2.IsSupported)
+                {
+                    Vector128<byte> low = Sse2.UnpackLow(asciiVector, zeroVector);
+                    Sse2.StoreAligned((byte*)pCurrentWriteAddress, low);
+
+                    Vector128<byte> high = Sse2.UnpackHigh(asciiVector, zeroVector);
+                    Sse2.StoreAligned((byte*)pCurrentWriteAddress + SizeOfVector128, high);
+                }
+                else if (AdvSimd.Arm64.IsSupported)
+                {
+                    Vector128<ushort> low = AdvSimd.ZeroExtendWideningLower(asciiVector.GetLower());
+                    Vector128<ushort> high = AdvSimd.ZeroExtendWideningUpper(asciiVector);
+                    AdvSimd.Arm64.StorePair((ushort*)pCurrentWriteAddress, low, high);
+                }
+                else
+                {
+                    throw new PlatformNotSupportedException();
+                }
+
+                currentOffset += SizeOfVector128;
+                pCurrentWriteAddress += SizeOfVector128;
+            } while (currentOffset <= finalOffsetWhereCanRunLoop);
+
+        Finish:
+
+            return currentOffset;
+
+        NonAsciiDataSeenInInnerLoop:
+
+            // Can we at least widen the first part of the vector?
+            // containsNonAsciiBytes is always true on this path, so inspect the low 8 bytes of the vector directly.
+            if ((asciiVector.AsUInt64().ToScalar() & 0x8080808080808080) == 0)
+            {
+                // First part was all ASCII, widen
+                if (Sse2.IsSupported)
+                {
+                    utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector);
+                    Sse2.StoreAligned((byte*)(pUtf16Buffer + currentOffset), utf16FirstHalfVector);
+                }
+                else if (AdvSimd.Arm64.IsSupported)
+                {
+                    Vector128<ushort> lower = AdvSimd.ZeroExtendWideningLower(asciiVector.GetLower());
+                    AdvSimd.Store((ushort*)(pUtf16Buffer + currentOffset), lower);
+                }
+                else
+                {
+                    throw new PlatformNotSupportedException();
+                }
+                currentOffset += SizeOfVector128 / 2;
+            }
+
+            goto Finish;
+        }
+
+        /// <summary>
+        /// Expands the four ASCII bytes packed in <paramref name="value"/> into four UTF-16 code units
+        /// and stores them, in machine byte order, starting at <paramref name="outputBuffer"/>.
+        /// </summary>
+        /// <param name="outputBuffer">Reference to the first of the four chars to be written.</param>
+        /// <param name="value">A DWORD which represents a buffer of 4 ASCII bytes.</param>
+        /// <remarks>
+        /// Every byte of <paramref name="value"/> is expected to be ASCII; this is only asserted in debug builds.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value)
+        {
+            Debug.Assert(AllBytesInUInt32AreAscii(value));
+
+            if (Sse2.X64.IsSupported)
+            {
+                // Interleave the four bytes with zero bytes, then store the low 64 bits in a single write.
+                Debug.Assert(BitConverter.IsLittleEndian, "SSE2 widening assumes little-endian.");
+                Vector128<byte> packed = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
+                ulong widened = Sse2.X64.ConvertToUInt64(Sse2.UnpackLow(packed, Vector128<byte>.Zero).AsUInt64());
+                Unsafe.WriteUnaligned<ulong>(ref Unsafe.As<char, byte>(ref outputBuffer), widened);
+            }
+            else if (AdvSimd.Arm64.IsSupported)
+            {
+                // Same interleave-with-zero approach, using the Arm64 zip operation.
+                Vector128<byte> packed = AdvSimd.DuplicateToVector128(value).AsByte();
+                ulong widened = AdvSimd.Arm64.ZipLow(packed, Vector128<byte>.Zero).AsUInt64().ToScalar();
+                Unsafe.WriteUnaligned<ulong>(ref Unsafe.As<char, byte>(ref outputBuffer), widened);
+            }
+            else if (BitConverter.IsLittleEndian)
+            {
+                // Scalar fallback: the least significant byte becomes the first char.
+                outputBuffer = (char)(byte)value;
+                Unsafe.Add(ref outputBuffer, 1) = (char)(byte)(value >> 8);
+                Unsafe.Add(ref outputBuffer, 2) = (char)(byte)(value >> 16);
+                Unsafe.Add(ref outputBuffer, 3) = (char)(value >> 24);
+            }
+            else
+            {
+                // Scalar fallback: the least significant byte becomes the last char.
+                Unsafe.Add(ref outputBuffer, 3) = (char)(byte)value;
+                Unsafe.Add(ref outputBuffer, 2) = (char)(byte)(value >> 8);
+                Unsafe.Add(ref outputBuffer, 1) = (char)(byte)(value >> 16);
+                outputBuffer = (char)(value >> 24);
+            }
+        }
+    }
+}
+#endif
diff --git a/src/DotNetty.Common/Internal/ASCIIUtility.NetCore3.cs b/src/DotNetty.Common/Internal/ASCIIUtility.NetCore3.cs
new file mode 100644
index 000000000..a642045ee
--- /dev/null
+++ b/src/DotNetty.Common/Internal/ASCIIUtility.NetCore3.cs
@@ -0,0 +1,1092 @@
+﻿// borrowed from https://github.com/dotnet/corefx/blob/release/3.1/src/Common/src/CoreLib/System/Text/ASCIIUtility.cs
+
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#if NETCOREAPP3_1
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace DotNetty.Common.Internal
+{
+    partial class ASCIIUtility
+    {
+        /// <summary>
+        /// Scans <paramref name="pBuffer"/> and returns the index of the first non-ASCII byte,
+        /// or <paramref name="bufferLength"/> if the buffer is empty or consists solely of ASCII.
+        /// </summary>
+        /// <remarks>An ASCII byte is any value in the range 0x00 - 0x7F, inclusive.</remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bufferLength)
+        {
+            // Prefer the SSE2-specific routine over the generic vectorized code when SSE2 is
+            // available: it can use instructions such as pmovmskb that are known to be fast,
+            // and it avoids downclocking the processor while this method is running.
+            if (Sse2.IsSupported)
+            {
+                return GetIndexOfFirstNonAsciiByte_Sse2(pBuffer, bufferLength);
+            }
+            return GetIndexOfFirstNonAsciiByte_Default(pBuffer, bufferLength);
+        }
+
+        private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuint bufferLength)
+        {
+            // SSE2 implementation: returns the offset (in bytes) of the first non-ASCII byte in pBuffer,
+            // or bufferLength if every byte is ASCII. The JIT turns the two locals below into constants.
+            uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
+            nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);
+
+            Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
+            Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");
+
+            uint currentMask, secondMask;
+            byte* pOriginalBuffer = pBuffer;
+
+            // This method is written such that control generally flows top-to-bottom, avoiding
+            // jumps as much as possible in the optimistic case of a large enough buffer and
+            // "all ASCII". If we see non-ASCII data, we jump out of the hot paths to targets
+            // after all the main logic.
+
+            if (bufferLength < SizeOfVector128)
+            {
+                goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead
+            }
+
+            // Read the first vector unaligned. A non-zero mask means some byte has its high bit set (non-ASCII).
+
+            currentMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
+
+            if (currentMask != 0)
+            {
+                goto FoundNonAsciiDataInCurrentMask;
+            }
+
+            // If we have fewer than 32 bytes (two vectors) to process, just go straight to the final unaligned
+            // read. There's no need to mess with the loop logic in the middle of this method.
+
+            if (bufferLength < 2 * SizeOfVector128)
+            {
+                goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
+            }
+
+            // Now round the read pointer up to the next vector-aligned address so that future reads are aligned.
+            // Everything skipped over lies within the first vector, which was already checked above.
+            pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector128) & ~(nuint)MaskOfAllBitsInVector128);
+
+#if DEBUG
+            long numBytesRead = pBuffer - pOriginalBuffer;
+            Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector128, "We should've made forward progress of at least one byte.");
+            Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
+#endif
+
+            // Adjust the remaining length to account for what we just read.
+
+            bufferLength += (nuint)pOriginalBuffer;
+            bufferLength -= (nuint)pBuffer;
+
+            // The buffer is now properly aligned.
+            // Read 2 vectors at a time if possible.
+
+            if (bufferLength >= 2 * SizeOfVector128)
+            {
+                byte* pFinalVectorReadPos = (byte*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128);
+
+                // After this point, we no longer need to update the bufferLength value.
+
+                do
+                {
+                    Vector128<byte> firstVector = Sse2.LoadAlignedVector128(pBuffer);
+                    Vector128<byte> secondVector = Sse2.LoadAlignedVector128(pBuffer + SizeOfVector128);
+
+                    currentMask = (uint)Sse2.MoveMask(firstVector);
+                    secondMask = (uint)Sse2.MoveMask(secondVector);
+
+                    if ((currentMask | secondMask) != 0)
+                    {
+                        goto FoundNonAsciiDataInInnerLoop;
+                    }
+
+                    pBuffer += 2 * SizeOfVector128;
+                } while (pBuffer <= pFinalVectorReadPos);
+            }
+
+            // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from.
+            // Since the above loop doesn't update bufferLength, we can't rely on its absolute value.
+            // But we _can_ rely on it to tell us how much remaining data must be drained by looking
+            // at what bits of it are set. This works because had we updated it within the loop above,
+            // we would've been changing it by 2 * SizeOfVector128 on each iteration, and we only care
+            // about bits which are less significant than those that such an update would've acted on.
+
+            // If there is fewer than one vector length remaining, skip the next aligned read.
+
+            if (0ul >= (bufferLength & SizeOfVector128))
+            {
+                goto DoFinalUnalignedVectorRead;
+            }
+
+            // At least one full vector's worth of data remains, so we can safely read it.
+            // Remember, at this point pBuffer is still aligned.
+
+            currentMask = (uint)Sse2.MoveMask(Sse2.LoadAlignedVector128(pBuffer));
+            if (currentMask != 0)
+            {
+                goto FoundNonAsciiDataInCurrentMask;
+            }
+
+        IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
+
+            pBuffer += SizeOfVector128;
+
+        DoFinalUnalignedVectorRead:
+
+            if (((byte)bufferLength & MaskOfAllBitsInVector128) != 0)
+            {
+                // Perform an unaligned read of the last vector.
+                // Back the pointer up so that this read ends exactly where the data ends; it re-reads some data.
+
+                pBuffer += (bufferLength & MaskOfAllBitsInVector128) - SizeOfVector128;
+
+                currentMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
+                if (currentMask != 0)
+                {
+                    goto FoundNonAsciiDataInCurrentMask;
+                }
+
+                pBuffer += SizeOfVector128;
+            }
+
+        Finish:
+
+            return (nuint)pBuffer - (nuint)pOriginalBuffer; // and we're done!
+
+        FoundNonAsciiDataInInnerLoop:
+
+            // If the current (first) mask isn't the mask that contains non-ASCII data, then it must
+            // instead be the second mask. If so, skip the entire first mask and drain ASCII bytes
+            // from the second mask.
+
+            if (0u >= currentMask)
+            {
+                pBuffer += SizeOfVector128;
+                currentMask = secondMask;
+            }
+
+        FoundNonAsciiDataInCurrentMask:
+
+            // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
+            // The number of trailing zero bits is therefore the number of leading ASCII bytes in the vector
+            // that was just examined, so advancing pBuffer by that count lands on the first non-ASCII byte.
+
+            Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
+            pBuffer += (uint)BitOperations.TrailingZeroCount(currentMask);
+
+            goto Finish;
+
+        FoundNonAsciiDataInCurrentDWord:
+            // currentDWord is declared here; each 'goto' that targets this label assigns it beforehand.
+            uint currentDWord;
+            Debug.Assert(!AllBytesInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");
+            pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentDWord);
+
+            goto Finish;
+
+        InputBufferLessThanOneVectorInLength:
+
+            // These code paths get hit if the original input length was less than one vector in size.
+            // We can't perform vectorized reads at this point, so we'll fall back to reading primitives
+            // directly. Note that all of these reads are unaligned.
+
+            Debug.Assert(bufferLength < SizeOfVector128);
+
+            // QWORD drain (bufferLength < SizeOfVector128, so each low bit of it selects one drain step below)
+
+            if ((bufferLength & 8) != 0)
+            {
+                if (Bmi1.X64.IsSupported)
+                {
+                    // If we can use 64-bit tzcnt to count the number of leading ASCII bytes, prefer it.
+
+                    ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
+                    if (!AllBytesInUInt64AreAscii(candidateUInt64))
+                    {
+                        // Clear everything but the high bit of each byte, then tzcnt.
+                        // Remember the / 8 at the end to convert bit count to byte count.
+
+                        candidateUInt64 &= UInt64HighBitsOnlyMask;
+                        pBuffer += (nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8);
+                        goto Finish;
+                    }
+                }
+                else
+                {
+                    // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead.
+
+                    currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
+                    uint nextDWord = Unsafe.ReadUnaligned<uint>(pBuffer + 4);
+
+                    if (!AllBytesInUInt32AreAscii(currentDWord | nextDWord))
+                    {
+                        // At least one of the values wasn't all-ASCII.
+                        // We need to figure out which one it was and stick it in the currentDWord local.
+
+                        if (AllBytesInUInt32AreAscii(currentDWord))
+                        {
+                            currentDWord = nextDWord; // this one is the culprit
+                            pBuffer += 4;
+                        }
+
+                        goto FoundNonAsciiDataInCurrentDWord;
+                    }
+                }
+
+                pBuffer += 8; // successfully consumed 8 ASCII bytes
+            }
+
+            // DWORD drain
+
+            if ((bufferLength & 4) != 0)
+            {
+                currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
+
+                if (!AllBytesInUInt32AreAscii(currentDWord))
+                {
+                    goto FoundNonAsciiDataInCurrentDWord;
+                }
+
+                pBuffer += 4; // successfully consumed 4 ASCII bytes
+            }
+
+            // WORD drain
+            // (We movzx to a DWORD for ease of manipulation.)
+
+            if ((bufferLength & 2) != 0)
+            {
+                currentDWord = Unsafe.ReadUnaligned<ushort>(pBuffer);
+
+                if (!AllBytesInUInt32AreAscii(currentDWord))
+                {
+                    // We only care about the 0x0080 bit of the value. If it's not set, then we
+                    // advance pBuffer by 1. If it's set, we don't advance it at all.
+
+                    pBuffer += (nuint)((nint)(sbyte)currentDWord >> 7) + 1;
+                    goto Finish;
+                }
+
+                pBuffer += 2; // successfully consumed 2 ASCII bytes
+            }
+
+            // BYTE drain
+
+            if ((bufferLength & 1) != 0)
+            {
+                // sbyte has non-negative value if byte is ASCII.
+
+                if (*(sbyte*)(pBuffer) >= 0)
+                {
+                    pBuffer++; // successfully consumed a single byte
+                }
+            }
+
+            goto Finish;
+        }
+
+        private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuint bufferLength /* in chars */)
+        {
+            // Returns the index (in chars) of the first non-ASCII char in pBuffer, or bufferLength if all are ASCII.
+            // Contains logic optimized for both SSE2 and SSE41; much of it will be elided by JIT once the supported ISAs are known.
+
+            // Quick check for empty inputs.
+
+            if (0ul >= bufferLength)
+            {
+                return 0;
+            }
+
+            // JIT turns the below into constants
+
+            uint SizeOfVector128InBytes = (uint)Unsafe.SizeOf<Vector128<byte>>();
+            uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char);
+
+            Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
+            Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");
+
+            Vector128<short> firstVector, secondVector;
+            uint currentMask;
+            char* pOriginalBuffer = pBuffer;
+
+            if (bufferLength < SizeOfVector128InChars)
+            {
+                goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead
+            }
+
+            // This method is written such that control generally flows top-to-bottom, avoiding
+            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
+            // data, we jump out of the hot paths to targets at the end of the method.
+
+            Vector128<short> asciiMaskForPTEST = Vector128.Create(unchecked((short)0xFF80)); // used for PTEST on supported hardware
+            Vector128<ushort> asciiMaskForPMINUW = Vector128.Create((ushort)0x0080); // used for PMINUW on supported hardware
+            Vector128<short> asciiMaskForPXOR = Vector128.Create(unchecked((short)0x8000)); // used for PXOR
+            Vector128<short> asciiMaskForPCMPGTW = Vector128.Create(unchecked((short)0x807F)); // used for PCMPGTW
+
+#if NET
+            Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));
+#endif
+
+            // Read the first vector unaligned.
+
+            firstVector = Sse2.LoadVector128((short*)pBuffer); // unaligned load
+
+            if (Sse41.IsSupported)
+            {
+                // The SSE41-optimized code path works by forcing the 0x0080 bit in each WORD of the vector to be
+                // set iff the WORD element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector
+                // in order to extract the mask.
+                currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
+            }
+            else
+            {
+                // The SSE2-optimized code path works by forcing each WORD of the vector to be 0xFFFF iff the WORD
+                // element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector in order to extract
+                // the mask.
+                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
+            }
+
+            if (currentMask != 0)
+            {
+                goto FoundNonAsciiDataInCurrentMask;
+            }
+
+            // If we have fewer than 32 bytes (two vectors) to process, just go straight to the final unaligned
+            // read. There's no need to mess with the loop logic in the middle of this method.
+
+            // Adjust the remaining length to account for what we just read.
+            // For the remainder of this code path, bufferLength will be in bytes, not chars.
+
+            bufferLength <<= 1; // chars to bytes
+
+            if (bufferLength < 2 * SizeOfVector128InBytes)
+            {
+                goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
+            }
+
+            // Now round the read pointer up to the next vector-aligned address so that future reads are aligned.
+            // Everything skipped over lies within the first vector, which was already checked above.
+            pBuffer = (char*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1));
+
+#if DEBUG
+            long numCharsRead = pBuffer - pOriginalBuffer;
+            Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector128InChars, "We should've made forward progress of at least one char.");
+            Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
+#endif
+
+            // Adjust remaining buffer length.
+
+            bufferLength += (nuint)pOriginalBuffer;
+            bufferLength -= (nuint)pBuffer;
+
+            // The buffer is now properly aligned.
+            // Read 2 vectors at a time if possible.
+
+            if (bufferLength >= 2 * SizeOfVector128InBytes)
+            {
+                char* pFinalVectorReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes);
+
+                // After this point, we no longer need to update the bufferLength value.
+
+                do
+                {
+                    firstVector = Sse2.LoadAlignedVector128((short*)pBuffer);
+                    secondVector = Sse2.LoadAlignedVector128((short*)pBuffer + SizeOfVector128InChars);
+                    Vector128<short> combinedVector = Sse2.Or(firstVector, secondVector);
+
+                    if (Sse41.IsSupported)
+                    {
+                        // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
+                        // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
+                        if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
+                        {
+                            goto FoundNonAsciiDataInFirstOrSecondVector;
+                        }
+                    }
+                    else
+                    {
+                        // See comment earlier in the method for an explanation of how the below logic works.
+                        if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
+                        {
+                            goto FoundNonAsciiDataInFirstOrSecondVector;
+                        }
+                    }
+
+                    pBuffer += 2 * SizeOfVector128InChars;
+                } while (pBuffer <= pFinalVectorReadPos);
+            }
+
+            // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from.
+            // Since the above loop doesn't update bufferLength, we can't rely on its absolute value.
+            // But we _can_ rely on it to tell us how much remaining data must be drained by looking
+            // at what bits of it are set. This works because had we updated it within the loop above,
+            // we would've been changing it by 2 * SizeOfVector128 on each iteration, and we only care
+            // about bits which are less significant than those that such an update would've acted on.
+
+            // If there is fewer than one vector length remaining, skip the next aligned read.
+            // Remember, at this point bufferLength is measured in bytes, not chars.
+
+            if (0ul >= (bufferLength & SizeOfVector128InBytes))
+            {
+                goto DoFinalUnalignedVectorRead;
+            }
+
+            // At least one full vector's worth of data remains, so we can safely read it.
+            // Remember, at this point pBuffer is still aligned.
+
+            firstVector = Sse2.LoadAlignedVector128((short*)pBuffer);
+
+            if (Sse41.IsSupported)
+            {
+                // If a non-ASCII bit is set in any WORD of this vector, we have seen non-ASCII data.
+                // Jump to the handler that computes the mask locating it within the vector.
+                if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
+                {
+                    goto FoundNonAsciiDataInFirstVector;
+                }
+            }
+            else
+            {
+                // See comment earlier in the method for an explanation of how the below logic works.
+                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
+                if (currentMask != 0)
+                {
+                    goto FoundNonAsciiDataInCurrentMask;
+                }
+            }
+
+        IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
+
+            pBuffer += SizeOfVector128InChars;
+
+        DoFinalUnalignedVectorRead:
+
+            if (((byte)bufferLength & (SizeOfVector128InBytes - 1)) != 0)
+            {
+                // Perform an unaligned read of the last vector.
+                // We need to adjust the pointer because we're re-reading data.
+
+                pBuffer = (char*)((byte*)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes);
+                firstVector = Sse2.LoadVector128((short*)pBuffer); // unaligned load
+
+                if (Sse41.IsSupported)
+                {
+                    // If a non-ASCII bit is set in any WORD of this vector, we have seen non-ASCII data.
+                    // Jump to the handler that computes the mask locating it within the vector.
+                    if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
+                    {
+                        goto FoundNonAsciiDataInFirstVector;
+                    }
+                }
+                else
+                {
+                    // See comment earlier in the method for an explanation of how the below logic works.
+                    currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
+                    if (currentMask != 0)
+                    {
+                        goto FoundNonAsciiDataInCurrentMask;
+                    }
+                }
+
+                pBuffer += SizeOfVector128InChars;
+            }
+
+        Finish:
+
+            Debug.Assert(((nuint)pBuffer - (nuint)pOriginalBuffer) % 2 == 0, "Shouldn't have incremented any pointer by an odd byte count.");
+            return ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char); // and we're done! (remember to adjust for char count)
+
+        FoundNonAsciiDataInFirstOrSecondVector:
+
+            // We don't know if the first or the second vector contains non-ASCII data. Check the first
+            // vector, and if that's all-ASCII then the second vector must be the culprit. Either way
+            // we'll make sure the first vector local is the one that contains the non-ASCII data.
+
+            // See comment earlier in the method for an explanation of how the below logic works.
+            if (Sse41.IsSupported)
+            {
+                if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
+                {
+                    goto FoundNonAsciiDataInFirstVector;
+                }
+            }
+            else
+            {
+                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
+                if (currentMask != 0)
+                {
+                    goto FoundNonAsciiDataInCurrentMask;
+                }
+            }
+
+            // Wasn't the first vector; must be the second.
+
+            pBuffer += SizeOfVector128InChars;
+            firstVector = secondVector;
+
+        FoundNonAsciiDataInFirstVector:
+
+            // See comment earlier in the method for an explanation of how the below logic works.
+            if (Sse41.IsSupported)
+            {
+                currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
+            }
+            else
+            {
+                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
+            }
+
+        FoundNonAsciiDataInCurrentMask:
+
+            // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
+            // The number of trailing zero bits is therefore a byte offset into the vector that was just examined;
+            // adding it to pBuffer as a byte count lands on the first non-ASCII char. (Even though the original
+            // vector used WORD elements, masks work on BYTE elements, and we account for this in the final fixup.)
+
+            Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
+            pBuffer = (char*)((byte*)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask));
+
+            goto Finish;
+
+        FoundNonAsciiDataInCurrentDWord:
+            // currentDWord is declared here; each 'goto' that targets this label assigns it beforehand.
+            uint currentDWord;
+            Debug.Assert(!AllCharsInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");
+
+            if (FirstCharInUInt32IsAscii(currentDWord))
+            {
+                pBuffer++; // skip past the ASCII char
+            }
+
+            goto Finish;
+
+        InputBufferLessThanOneVectorInLength:
+
+            // These code paths get hit if the original input length was less than one vector in size.
+            // We can't perform vectorized reads at this point, so we'll fall back to reading primitives
+            // directly. Note that all of these reads are unaligned.
+
+            // Reminder: If this code path is hit, bufferLength is still a char count, not a byte count.
+            // We skipped the code path that multiplied the count by sizeof(char).
+
+            Debug.Assert(bufferLength < SizeOfVector128InChars);
+
+            // QWORD drain (bufferLength < SizeOfVector128InChars, so each low bit of it selects one drain step below)
+
+            if ((bufferLength & 4) != 0)
+            {
+                if (Bmi1.X64.IsSupported)
+                {
+                    // If we can use 64-bit tzcnt to count the number of leading ASCII chars, prefer it.
+
+                    ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
+                    if (!AllCharsInUInt64AreAscii(candidateUInt64))
+                    {
+                        // Clear the low 7 bits (the ASCII bits) of each char, then tzcnt.
+                        // Remember the / 8 at the end to convert bit count to byte count,
+                        // then the & ~1 at the end to treat a match in the high byte of
+                        // any char the same as a match in the low byte of that same char.
+
+                        candidateUInt64 &= 0xFF80FF80_FF80FF80ul;
+                        pBuffer = (char*)((byte*)pBuffer + ((nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8) & ~(nuint)1));
+                        goto Finish;
+                    }
+                }
+                else
+                {
+                    // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead.
+
+                    currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
+                    uint nextDWord = Unsafe.ReadUnaligned<uint>(pBuffer + 4 / sizeof(char));
+
+                    if (!AllCharsInUInt32AreAscii(currentDWord | nextDWord))
+                    {
+                        // At least one of the values wasn't all-ASCII.
+                        // We need to figure out which one it was and stick it in the currentDWord local.
+
+                        if (AllCharsInUInt32AreAscii(currentDWord))
+                        {
+                            currentDWord = nextDWord; // this one is the culprit
+                            pBuffer += 4 / sizeof(char);
+                        }
+
+                        goto FoundNonAsciiDataInCurrentDWord;
+                    }
+                }
+
+                pBuffer += 4; // successfully consumed 4 ASCII chars
+            }
+
+            // DWORD drain
+
+            if ((bufferLength & 2) != 0)
+            {
+                currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
+
+                if (!AllCharsInUInt32AreAscii(currentDWord))
+                {
+                    goto FoundNonAsciiDataInCurrentDWord;
+                }
+
+                pBuffer += 2; // successfully consumed 2 ASCII chars
+            }
+
+            // WORD drain
+            // This is the final drain; there's no need for a BYTE drain since our elemental type is 16-bit char.
+
+            if ((bufferLength & 1) != 0)
+            {
+                if (*pBuffer <= 0x007F)
+                {
+                    pBuffer++; // successfully consumed a single char
+                }
+            }
+
+            goto Finish;
+        }
+
+        private static unsafe nuint NarrowUtf16ToAscii_Sse2(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
+        {
+            // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
+            // will be elided by JIT once we determine which specific ISAs we support.
+
+            // JIT turns the below into constants
+
+            uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
+            nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);
+
+            // This method is written such that control generally flows top-to-bottom, avoiding
+            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
+            // data, we jump out of the hot paths to targets at the end of the method.
+
+            Debug.Assert(Sse2.IsSupported);
+            Debug.Assert(BitConverter.IsLittleEndian);
+            Debug.Assert(elementCount >= 2 * SizeOfVector128);
+
+            Vector128<short> asciiMaskForPTEST = Vector128.Create(unchecked((short)0xFF80)); // used for PTEST on supported hardware
+            Vector128<short> asciiMaskForPXOR = Vector128.Create(unchecked((short)0x8000)); // used for PXOR
+            Vector128<short> asciiMaskForPCMPGTW = Vector128.Create(unchecked((short)0x807F)); // used for PCMPGTW
+
+            // First, perform an unaligned read of the first part of the input buffer.
+
+            Vector128<short> utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer); // unaligned load
+
+            // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do.
+            // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
+
+            if (Sse41.IsSupported)
+            {
+                if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
+                {
+                    return 0;
+                }
+            }
+            else
+            {
+                if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
+                {
+                    return 0;
+                }
+            }
+
+            // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
+
+            Vector128<byte> asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
+            Sse2.StoreScalar((ulong*)pAsciiBuffer, asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
+
+            nuint currentOffsetInElements = SizeOfVector128 / 2; // we processed 8 elements so far
+
+            // We're going to get the best performance when we have aligned writes, so we'll take the
+            // hit of potentially unaligned reads in order to hit this sweet spot.
+
+            // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote
+            // the 8 bytes previously. If the 0x08 bit is set at the pinned address, then the 8 bytes we wrote
+            // previously mean that the 0x08 bit is *not* set at address &pAsciiBuffer[SizeOfVector128 / 2]. In
+            // that case we can immediately back up to the previous aligned boundary and start the main loop.
+            // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at
+            // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump
+            // just past the next aligned boundary address.
+
+            if (0u >= ((uint)pAsciiBuffer & (SizeOfVector128 / 2)))
+            {
+                // We need to perform one more partial vector write before we can get the alignment we want.
+
+                utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements); // unaligned load
+
+                // See comments earlier in this method for information about how this works.
+                if (Sse41.IsSupported)
+                {
+                    if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
+                    {
+                        goto Finish;
+                    }
+                }
+                else
+                {
+                    if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
+                    {
+                        goto Finish;
+                    }
+                }
+
+                // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
+                asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
+                Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
+            }
+
+            // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
+            // point, then use that as the base offset going forward.
+
+            currentOffsetInElements = SizeOfVector128 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector128);
+            Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector128, "We wrote at least 1 byte but no more than a whole vector.");
+
+            Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer.");
+            Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector128, "We should be able to run at least one whole vector.");
+
+            nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128;
+            do
+            {
+                // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector.
+
+                utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements); // unaligned load
+                Vector128<short> utf16VectorSecond = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements + SizeOfVector128 / sizeof(short)); // unaligned load
+                Vector128<short> combinedVector = Sse2.Or(utf16VectorFirst, utf16VectorSecond);
+
+                // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
+                if (Sse41.IsSupported)
+                {
+                    if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
+                    {
+                        goto FoundNonAsciiDataInLoop;
+                    }
+                }
+                else
+                {
+                    if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
+                    {
+                        goto FoundNonAsciiDataInLoop;
+                    }
+                }
+
+                // Build up the UTF-8 vector and perform the store.
+
+                asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorSecond);
+
+                Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector128 == 0, "Write should be aligned.");
+                Sse2.StoreAligned(pAsciiBuffer + currentOffsetInElements, asciiVector); // aligned
+
+                currentOffsetInElements += SizeOfVector128;
+            } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop);
+
+        Finish:
+
+            // There might be some ASCII data left over. That's fine - we'll let our caller handle the final drain.
+            return currentOffsetInElements;
+
+        FoundNonAsciiDataInLoop:
+
+            // Can we at least narrow the high vector?
+            // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
+            if (Sse41.IsSupported)
+            {
+                if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
+                {
+                    goto Finish; // found non-ASCII data
+                }
+            }
+            else
+            {
+                if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
+                {
+                    goto Finish; // found non-ASCII data
+                }
+            }
+
+            // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
+            asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
+
+            Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
+
+            Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is aligned
+            currentOffsetInElements += SizeOfVector128 / 2;
+
+            goto Finish;
+        }
+
+        /// <summary>
+        /// Copies as many ASCII bytes (00..7F) as possible from <paramref name="pAsciiBuffer"/>
+        /// to <paramref name="pUtf16Buffer"/>, widening each byte to one UTF-16 char, and stops at the
+        /// first non-ASCII byte or once <paramref name="elementCount"/> elements have been converted.
+        /// </summary>
+        /// <returns>The total number of elements that were able to be converted.</returns>
+        public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
+        {
+            nuint currentOffset = 0; // elements converted so far; also the return value
+
+            // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
+            // code below. This has two benefits: (a) we can take advantage of specific instructions like
+            // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
+            // this method is running.
+
+            if (Sse2.IsSupported)
+            {
+                if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>()) // minimum size asserted by the SSE2 helper
+                {
+                    currentOffset = WidenAsciiToUtf16_Sse2(pAsciiBuffer, pUtf16Buffer, elementCount);
+                }
+            }
+            else if (Vector.IsHardwareAccelerated)
+            {
+                uint SizeOfVector = (uint)Unsafe.SizeOf<Vector<byte>>(); // JIT will make this a const
+
+                // Only bother vectorizing if we have enough data to do so.
+                if (elementCount >= SizeOfVector)
+                {
+                    // Note use of SBYTE instead of BYTE below: a byte with its high bit set (80..FF) is
+                    // negative when viewed as an sbyte, so "any lane < 0" means "some byte is non-ASCII".
+
+                    nuint finalOffsetWhereCanLoop = elementCount - SizeOfVector; // last offset where a whole vector still fits
+                    do
+                    {
+                        Vector<sbyte> asciiVector = Unsafe.ReadUnaligned<Vector<sbyte>>(pAsciiBuffer + currentOffset);
+                        if (Vector.LessThanAny(asciiVector, Vector<sbyte>.Zero))
+                        {
+                            break; // found non-ASCII data
+                        }
+
+                        Vector.Widen(Vector.AsVectorByte(asciiVector), out Vector<ushort> utf16LowVector, out Vector<ushort> utf16HighVector);
+
+                        // TODO: Is the below logic also valid for big-endian platforms?
+                        Unsafe.WriteUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset, utf16LowVector);
+                        Unsafe.WriteUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset + Vector<ushort>.Count, utf16HighVector);
+
+                        currentOffset += SizeOfVector;
+                    } while (currentOffset <= finalOffsetWhereCanLoop);
+                }
+            }
+
+            Debug.Assert(currentOffset <= elementCount);
+            nuint remainingElementCount = elementCount - currentOffset;
+
+            // Try to widen 32 bits -> 64 bits at a time. remainingElementCount is not updated from here on:
+            // the loop below consumes multiples of 4, and the later steps only test its two lowest bits.
+
+            uint asciiData; // most recently read chunk; inspected again at FoundNonAsciiData
+
+            if (remainingElementCount >= 4)
+            {
+                nuint finalOffsetWhereCanLoop = currentOffset + remainingElementCount - 4; // last offset where 4 bytes still fit
+                do
+                {
+                    asciiData = Unsafe.ReadUnaligned<uint>(pAsciiBuffer + currentOffset);
+                    if (!AllBytesInUInt32AreAscii(asciiData))
+                    {
+                        goto FoundNonAsciiData;
+                    }
+
+                    WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pUtf16Buffer[currentOffset], asciiData);
+                    currentOffset += 4;
+                } while (currentOffset <= finalOffsetWhereCanLoop);
+            }
+
+            // Try to widen 16 bits -> 32 bits (only if bit 1 of the remaining count is set).
+
+            if (((uint)remainingElementCount & 2) != 0)
+            {
+                asciiData = Unsafe.ReadUnaligned<ushort>(pAsciiBuffer + currentOffset); // zero-extended to 32 bits
+                if (!AllBytesInUInt32AreAscii(asciiData))
+                {
+                    goto FoundNonAsciiData;
+                }
+
+                if (BitConverter.IsLittleEndian)
+                {
+                    pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
+                    pUtf16Buffer[currentOffset + 1] = (char)(asciiData >> 8);
+                }
+                else
+                {
+                    pUtf16Buffer[currentOffset + 1] = (char)(byte)asciiData;
+                    pUtf16Buffer[currentOffset] = (char)(asciiData >> 8);
+                }
+
+                currentOffset += 2;
+            }
+
+            // Try to widen 8 bits -> 16 bits (only if bit 0 of the remaining count is set).
+
+            if (((uint)remainingElementCount & 1) != 0)
+            {
+                asciiData = pAsciiBuffer[currentOffset];
+                if (((byte)asciiData & 0x80) != 0)
+                {
+                    goto Finish; // non-ASCII byte: stop without writing it
+                }
+
+                pUtf16Buffer[currentOffset] = (char)asciiData;
+                currentOffset += 1;
+            }
+
+        Finish:
+
+            return currentOffset;
+
+        FoundNonAsciiData:
+
+            Debug.Assert(!AllBytesInUInt32AreAscii(asciiData), "Shouldn't have reached this point if we have an all-ASCII input.");
+
+            // Drain ASCII bytes one at a time, starting from the low-order byte of asciiData.
+            // NOTE(review): low-order-first matches memory order only on little-endian - confirm for big-endian.
+            while (0u >= (uint)((byte)asciiData & 0x80)) // for an unsigned x, "0u >= x" is the same test as "x == 0"
+            {
+                pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
+                currentOffset += 1;
+                asciiData >>= 8;
+            }
+
+            goto Finish;
+        }
+
+        private static unsafe nuint WidenAsciiToUtf16_Sse2(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
+        {
+            // SSE2 path: returns the element offset up to which output was filled (0 if the first 8 bytes are not all ASCII).
+            // JIT turns the below into constants
+            uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
+            nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);
+
+            // This method is written such that control generally flows top-to-bottom, avoiding
+            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
+            // data, we jump out of the hot paths to targets at the end of the method.
+
+            Debug.Assert(Sse2.IsSupported);
+            Debug.Assert(BitConverter.IsLittleEndian);
+            Debug.Assert(elementCount >= 2 * SizeOfVector128);
+
+            // We're going to get the best performance when we have aligned writes, so we'll take the
+            // hit of potentially unaligned reads in order to hit this sweet spot.
+
+            Vector128<byte> asciiVector;
+            Vector128<byte> utf16FirstHalfVector;
+            uint mask; // one bit per input byte; a set bit marks a byte whose high bit is set (non-ASCII)
+
+            // First, perform an unaligned read of the first part of the input buffer.
+
+            asciiVector = Sse2.LoadVector128(pAsciiBuffer); // unaligned load
+            mask = (uint)Sse2.MoveMask(asciiVector);
+
+            // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do.
+
+            if ((byte)mask != 0)
+            {
+                return 0;
+            }
+
+            // Then widen those first 8 bytes to 8 chars and perform an unaligned write to the output buffer.
+
+            Vector128<byte> zeroVector = Vector128<byte>.Zero;
+
+            utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector);
+            Sse2.Store((byte*)pUtf16Buffer, utf16FirstHalfVector); // unaligned
+
+            // Calculate how many elements we wrote in order to get pUtf16Buffer to its next alignment
+            // point, then use that as the base offset going forward. Remember the >> 1 to account for
+            // that we wrote chars, not bytes. This means we may re-read data in the next iteration of
+            // the loop, but this is ok.
+
+            nuint currentOffset = (SizeOfVector128 >> 1) - (((nuint)pUtf16Buffer >> 1) & (MaskOfAllBitsInVector128 >> 1));
+            Debug.Assert(0 < currentOffset && currentOffset <= SizeOfVector128 / sizeof(char));
+
+            nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128; // last offset where a whole-vector read stays in bounds
+
+            do
+            {
+                // In a loop, perform an unaligned read, widen to two vectors, then aligned write the two vectors.
+
+                asciiVector = Sse2.LoadVector128(pAsciiBuffer + currentOffset); // unaligned load
+                mask = (uint)Sse2.MoveMask(asciiVector);
+
+                if (mask != 0)
+                {
+                    // non-ASCII byte somewhere
+                    goto NonAsciiDataSeenInInnerLoop;
+                }
+
+                byte* pStore = (byte*)(pUtf16Buffer + currentOffset);
+                Sse2.StoreAligned(pStore, Sse2.UnpackLow(asciiVector, zeroVector)); // low 8 bytes widened to 8 chars
+
+                pStore += SizeOfVector128;
+                Sse2.StoreAligned(pStore, Sse2.UnpackHigh(asciiVector, zeroVector)); // high 8 bytes widened to 8 chars
+
+                currentOffset += SizeOfVector128;
+            } while (currentOffset <= finalOffsetWhereCanRunLoop);
+
+        Finish:
+
+            return currentOffset;
+
+        NonAsciiDataSeenInInnerLoop:
+
+            // Can we at least widen the first part of the vector?
+
+            if (0u >= ((byte)mask)) // low 8 mask bits clear: the first 8 bytes of this vector are ASCII
+            {
+                // First part was all ASCII, widen
+                utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector);
+                Sse2.StoreAligned((byte*)(pUtf16Buffer + currentOffset), utf16FirstHalfVector);
+                currentOffset += SizeOfVector128 / 2; // 8 more elements were written
+            }
+
+            goto Finish;
+        }
+
+        /// <summary>
+        /// Given a DWORD which represents a buffer of 4 ASCII bytes, widens each byte to a WORD and
+        /// writes the 4 resulting chars to the output buffer with machine endianness.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value)
+        {
+            Debug.Assert(AllBytesInUInt32AreAscii(value));
+
+            if (Bmi2.X64.IsSupported)
+            {
+                // PDEP places each source byte in the low byte of a 16-bit lane. BMI2 will work regardless of the processor's endianness.
+                Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul));
+            }
+            else
+            {
+                if (BitConverter.IsLittleEndian)
+                {
+                    outputBuffer = (char)(byte)value; // low-order byte becomes the first char
+                    value >>= 8;
+                    Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
+                    value >>= 8;
+                    Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
+                    value >>= 8;
+                    Unsafe.Add(ref outputBuffer, 3) = (char)value; // only the top byte is left, so no (byte) cast is needed
+                }
+                else
+                {
+                    Unsafe.Add(ref outputBuffer, 3) = (char)(byte)value; // low-order byte becomes the last char
+                    value >>= 8;
+                    Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
+                    value >>= 8;
+                    Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
+                    value >>= 8;
+                    outputBuffer = (char)value; // only the top byte is left, so no (byte) cast is needed
+                }
+            }
+        }
+    }
+}
+#endif
diff --git a/src/DotNetty.Common/Internal/ASCIIUtility.cs b/src/DotNetty.Common/Internal/ASCIIUtility.cs
index cb419bb84..5427bdb00 100644
--- a/src/DotNetty.Common/Internal/ASCIIUtility.cs
+++ b/src/DotNetty.Common/Internal/ASCIIUtility.cs
@@ -1,4 +1,4 @@
-﻿// borrowed from https://github.com/dotnet/corefx/blob/release/3.1/src/Common/src/CoreLib/System/Text/cs
+﻿// borrowed from https://github.com/dotnet/corefx/blob/release/3.1/src/Common/src/CoreLib/System/Text/ASCIIUtility.cs
 
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
@@ -11,6 +11,9 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
+#if NET
+using System.Runtime.Intrinsics.Arm;
+#endif
 
 namespace DotNetty.Common.Internal
 {
@@ -21,7 +24,7 @@ private static bool AllBytesInUInt64AreAscii(ulong value)
         {
             // If the high bit of any byte is set, that byte is non-ASCII.
 
-            return (0ul >= (value & UInt64HighBitsOnlyMask));
+            return 0ul >= (value & UInt64HighBitsOnlyMask);
         }
 
         /// <summary>
@@ -30,7 +33,7 @@ private static bool AllBytesInUInt64AreAscii(ulong value)
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static bool AllCharsInUInt32AreAscii(uint value)
         {
-            return (0u >= (value & ~0x007F007Fu));
+            return 0u >= (value & ~0x007F007Fu);
         }
 
         /// <summary>
@@ -39,7 +42,7 @@ private static bool AllCharsInUInt32AreAscii(uint value)
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static bool AllCharsInUInt64AreAscii(ulong value)
         {
-            return (0ul >= (value & ~0x007F007F_007F007Ful));
+            return 0ul >= (value & ~0x007F007F_007F007Ful);
         }
 
         /// <summary>
@@ -54,24 +57,6 @@ private static bool FirstCharInUInt32IsAscii(uint value)
                 || (!BitConverter.IsLittleEndian && 0u >= (value & 0xFF800000u));
         }
 
-        /// <summary>
-        /// Returns the index in <paramref name="pBuffer"/> where the first non-ASCII byte is found.
-        /// Returns <paramref name="bufferLength"/> if the buffer is empty or all-ASCII.
-        /// </summary>
-        /// <returns>An ASCII byte is defined as 0x00 - 0x7F, inclusive.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bufferLength)
-        {
-            // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
-            // code below. This has two benefits: (a) we can take advantage of specific instructions like
-            // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
-            // this method is running.
-
-            return (Sse2.IsSupported)
-                ? GetIndexOfFirstNonAsciiByte_Sse2(pBuffer, bufferLength)
-                : GetIndexOfFirstNonAsciiByte_Default(pBuffer, bufferLength);
-        }
-
         private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, nuint bufferLength)
         {
             // Squirrel away the original buffer reference. This method works by determining the exact
@@ -215,267 +200,6 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, n
             goto Finish;
         }
 
-        private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuint bufferLength)
-        {
-            // JIT turns the below into constants
-
-            uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
-            nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);
-
-            Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
-            Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");
-
-            uint currentMask, secondMask;
-            byte* pOriginalBuffer = pBuffer;
-
-            // This method is written such that control generally flows top-to-bottom, avoiding
-            // jumps as much as possible in the optimistic case of a large enough buffer and
-            // "all ASCII". If we see non-ASCII data, we jump out of the hot paths to targets
-            // after all the main logic.
-
-            if (bufferLength < SizeOfVector128)
-            {
-                goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead
-            }
-
-            // Read the first vector unaligned.
-
-            currentMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
-
-            if (currentMask != 0)
-            {
-                goto FoundNonAsciiDataInCurrentMask;
-            }
-
-            // If we have less than 32 bytes to process, just go straight to the final unaligned
-            // read. There's no need to mess with the loop logic in the middle of this method.
-
-            if (bufferLength < 2 * SizeOfVector128)
-            {
-                goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
-            }
-
-            // Now adjust the read pointer so that future reads are aligned.
-
-            pBuffer = (byte*)(((nuint)pBuffer + SizeOfVector128) & ~(nuint)MaskOfAllBitsInVector128);
-
-#if DEBUG
-            long numBytesRead = pBuffer - pOriginalBuffer;
-            Debug.Assert(0 < numBytesRead && numBytesRead <= SizeOfVector128, "We should've made forward progress of at least one byte.");
-            Debug.Assert((nuint)numBytesRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
-#endif
-
-            // Adjust the remaining length to account for what we just read.
-
-            bufferLength += (nuint)pOriginalBuffer;
-            bufferLength -= (nuint)pBuffer;
-
-            // The buffer is now properly aligned.
-            // Read 2 vectors at a time if possible.
-
-            if (bufferLength >= 2 * SizeOfVector128)
-            {
-                byte* pFinalVectorReadPos = (byte*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128);
-
-                // After this point, we no longer need to update the bufferLength value.
-
-                do
-                {
-                    Vector128<byte> firstVector = Sse2.LoadAlignedVector128(pBuffer);
-                    Vector128<byte> secondVector = Sse2.LoadAlignedVector128(pBuffer + SizeOfVector128);
-
-                    currentMask = (uint)Sse2.MoveMask(firstVector);
-                    secondMask = (uint)Sse2.MoveMask(secondVector);
-
-                    if ((currentMask | secondMask) != 0)
-                    {
-                        goto FoundNonAsciiDataInInnerLoop;
-                    }
-
-                    pBuffer += 2 * SizeOfVector128;
-                } while (pBuffer <= pFinalVectorReadPos);
-            }
-
-            // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from.
-            // Since the above loop doesn't update bufferLength, we can't rely on its absolute value.
-            // But we _can_ rely on it to tell us how much remaining data must be drained by looking
-            // at what bits of it are set. This works because had we updated it within the loop above,
-            // we would've been adding 2 * SizeOfVector128 on each iteration, but we only care about
-            // bits which are less significant than those that the addition would've acted on.
-
-            // If there is fewer than one vector length remaining, skip the next aligned read.
-
-            if (0ul >= (bufferLength & SizeOfVector128))
-            {
-                goto DoFinalUnalignedVectorRead;
-            }
-
-            // At least one full vector's worth of data remains, so we can safely read it.
-            // Remember, at this point pBuffer is still aligned.
-
-            currentMask = (uint)Sse2.MoveMask(Sse2.LoadAlignedVector128(pBuffer));
-            if (currentMask != 0)
-            {
-                goto FoundNonAsciiDataInCurrentMask;
-            }
-
-        IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
-
-            pBuffer += SizeOfVector128;
-
-        DoFinalUnalignedVectorRead:
-
-            if (((byte)bufferLength & MaskOfAllBitsInVector128) != 0)
-            {
-                // Perform an unaligned read of the last vector.
-                // We need to adjust the pointer because we're re-reading data.
-
-                pBuffer += (bufferLength & MaskOfAllBitsInVector128) - SizeOfVector128;
-
-                currentMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
-                if (currentMask != 0)
-                {
-                    goto FoundNonAsciiDataInCurrentMask;
-                }
-
-                pBuffer += SizeOfVector128;
-            }
-
-        Finish:
-
-            return (nuint)pBuffer - (nuint)pOriginalBuffer; // and we're done!
-
-        FoundNonAsciiDataInInnerLoop:
-
-            // If the current (first) mask isn't the mask that contains non-ASCII data, then it must
-            // instead be the second mask. If so, skip the entire first mask and drain ASCII bytes
-            // from the second mask.
-
-            if (0u >= currentMask)
-            {
-                pBuffer += SizeOfVector128;
-                currentMask = secondMask;
-            }
-
-        FoundNonAsciiDataInCurrentMask:
-
-            // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
-            // Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't
-            // available, we'll fall back to a normal loop.
-
-            Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
-            pBuffer += (uint)BitOperations.TrailingZeroCount(currentMask);
-
-            goto Finish;
-
-        FoundNonAsciiDataInCurrentDWord:
-
-            uint currentDWord;
-            Debug.Assert(!AllBytesInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");
-            pBuffer += CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(currentDWord);
-
-            goto Finish;
-
-        InputBufferLessThanOneVectorInLength:
-
-            // These code paths get hit if the original input length was less than one vector in size.
-            // We can't perform vectorized reads at this point, so we'll fall back to reading primitives
-            // directly. Note that all of these reads are unaligned.
-
-            Debug.Assert(bufferLength < SizeOfVector128);
-
-            // QWORD drain
-
-            if ((bufferLength & 8) != 0)
-            {
-                if (Bmi1.X64.IsSupported)
-                {
-                    // If we can use 64-bit tzcnt to count the number of leading ASCII bytes, prefer it.
-
-                    ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
-                    if (!AllBytesInUInt64AreAscii(candidateUInt64))
-                    {
-                        // Clear everything but the high bit of each byte, then tzcnt.
-                        // Remember the / 8 at the end to convert bit count to byte count.
-
-                        candidateUInt64 &= UInt64HighBitsOnlyMask;
-                        pBuffer += (nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8);
-                        goto Finish;
-                    }
-                }
-                else
-                {
-                    // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead.
-
-                    currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
-                    uint nextDWord = Unsafe.ReadUnaligned<uint>(pBuffer + 4);
-
-                    if (!AllBytesInUInt32AreAscii(currentDWord | nextDWord))
-                    {
-                        // At least one of the values wasn't all-ASCII.
-                        // We need to figure out which one it was and stick it in the currentMask local.
-
-                        if (AllBytesInUInt32AreAscii(currentDWord))
-                        {
-                            currentDWord = nextDWord; // this one is the culprit
-                            pBuffer += 4;
-                        }
-
-                        goto FoundNonAsciiDataInCurrentDWord;
-                    }
-                }
-
-                pBuffer += 8; // successfully consumed 8 ASCII bytes
-            }
-
-            // DWORD drain
-
-            if ((bufferLength & 4) != 0)
-            {
-                currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
-
-                if (!AllBytesInUInt32AreAscii(currentDWord))
-                {
-                    goto FoundNonAsciiDataInCurrentDWord;
-                }
-
-                pBuffer += 4; // successfully consumed 4 ASCII bytes
-            }
-
-            // WORD drain
-            // (We movzx to a DWORD for ease of manipulation.)
-
-            if ((bufferLength & 2) != 0)
-            {
-                currentDWord = Unsafe.ReadUnaligned<ushort>(pBuffer);
-
-                if (!AllBytesInUInt32AreAscii(currentDWord))
-                {
-                    // We only care about the 0x0080 bit of the value. If it's not set, then we
-                    // increment currentOffset by 1. If it's set, we don't increment it at all.
-
-                    pBuffer += (nuint)((nint)(sbyte)currentDWord >> 7) + 1;
-                    goto Finish;
-                }
-
-                pBuffer += 2; // successfully consumed 2 ASCII bytes
-            }
-
-            // BYTE drain
-
-            if ((bufferLength & 1) != 0)
-            {
-                // sbyte has non-negative value if byte is ASCII.
-
-                if (*(sbyte*)(pBuffer) >= 0)
-                {
-                    pBuffer++; // successfully consumed a single byte
-                }
-            }
-
-            goto Finish;
-        }
-
         /// <summary>
         /// Returns the index in <paramref name="pBuffer"/> where the first non-ASCII char is found.
         /// Returns <paramref name="bufferLength"/> if the buffer is empty or all-ASCII.
@@ -630,476 +354,137 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Default(char* pBuffer, n
             goto Finish;
         }
 
-        private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuint bufferLength /* in chars */)
+        /// <summary>
+        /// Given a QWORD which represents a buffer of 4 ASCII chars in machine-endian order,
+        /// narrows each WORD to a BYTE, then writes the 4-byte result to the output buffer
+        /// also in machine-endian order.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref byte outputBuffer, ulong value)
         {
-            // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
-            // will be elided by JIT once we determine which specific ISAs we support.
-
-            // Quick check for empty inputs.
+            Debug.Assert(AllCharsInUInt64AreAscii(value));
 
-            if (0ul >= bufferLength)
+#if NETCOREAPP3_1
+            if (Bmi2.X64.IsSupported)
             {
-                return 0;
+                // BMI2 will work regardless of the processor's endianness.
+                Unsafe.WriteUnaligned(ref outputBuffer, (uint)Bmi2.X64.ParallelBitExtract(value, 0x00FF00FF_00FF00FFul));
             }
-
-            // JIT turns the below into constants
-
-            uint SizeOfVector128InBytes = (uint)Unsafe.SizeOf<Vector128<byte>>();
-            uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char);
-
-            Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
-            Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");
-
-            Vector128<short> firstVector, secondVector;
-            uint currentMask;
-            char* pOriginalBuffer = pBuffer;
-
-            if (bufferLength < SizeOfVector128InChars)
+#else
+            if (Sse2.X64.IsSupported)
             {
-                goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead
-            }
-
-            // This method is written such that control generally flows top-to-bottom, avoiding
-            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
-            // data, we jump out of the hot paths to targets at the end of the method.
-
-            Vector128<short> asciiMaskForPTEST = Vector128.Create(unchecked((short)0xFF80)); // used for PTEST on supported hardware
-            Vector128<ushort> asciiMaskForPMINUW = Vector128.Create((ushort)0x0080); // used for PMINUW on supported hardware
-            Vector128<short> asciiMaskForPXOR = Vector128.Create(unchecked((short)0x8000)); // used for PXOR
-            Vector128<short> asciiMaskForPCMPGTW = Vector128.Create(unchecked((short)0x807F)); // used for PCMPGTW
-
-#if NET
-            Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));
-#endif
-
-            // Read the first vector unaligned.
+                // Narrows a vector of words [ w0 w1 w2 w3 ] to a vector of bytes
+                // [ b0 b1 b2 b3 b0 b1 b2 b3 ], then writes 4 bytes (32 bits) to the destination.
 
-            firstVector = Sse2.LoadVector128((short*)pBuffer); // unaligned load
-
-            if (Sse41.IsSupported)
+                Vector128<short> vecWide = Sse2.X64.ConvertScalarToVector128UInt64(value).AsInt16();
+                Vector128<uint> vecNarrow = Sse2.PackUnsignedSaturate(vecWide, vecWide).AsUInt32();
+                Unsafe.WriteUnaligned<uint>(ref outputBuffer, Sse2.ConvertToUInt32(vecNarrow));
+            }
+            else if (AdvSimd.IsSupported)
             {
-                // The SSE41-optimized code path works by forcing the 0x0080 bit in each WORD of the vector to be
-                // set iff the WORD element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector
-                // in order to extract the mask.
-                currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
+                // Narrows a vector of words [ w0 w1 w2 w3 ] to a vector of bytes
+                // [ b0 b1 b2 b3 * * * * ], then writes 4 bytes (32 bits) to the destination.
+
+                Vector128<short> vecWide = Vector128.CreateScalarUnsafe(value).AsInt16();
+                Vector64<byte> lower = AdvSimd.ExtractNarrowingSaturateUnsignedLower(vecWide);
+                Unsafe.WriteUnaligned<uint>(ref outputBuffer, lower.AsUInt32().ToScalar());
             }
+#endif
             else
             {
-                // The SSE2-optimized code path works by forcing each WORD of the vector to be 0xFFFF iff the WORD
-                // element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector in order to extract
-                // the mask.
-                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
+                if (BitConverter.IsLittleEndian)
+                {
+                    outputBuffer = (byte)value;
+                    value >>= 16;
+                    Unsafe.Add(ref outputBuffer, 1) = (byte)value;
+                    value >>= 16;
+                    Unsafe.Add(ref outputBuffer, 2) = (byte)value;
+                    value >>= 16;
+                    Unsafe.Add(ref outputBuffer, 3) = (byte)value;
+                }
+                else
+                {
+                    Unsafe.Add(ref outputBuffer, 3) = (byte)value;
+                    value >>= 16;
+                    Unsafe.Add(ref outputBuffer, 2) = (byte)value;
+                    value >>= 16;
+                    Unsafe.Add(ref outputBuffer, 1) = (byte)value;
+                    value >>= 16;
+                    outputBuffer = (byte)value;
+                }
             }
+        }
+
+        /// <summary>
+        /// Given a DWORD which represents a buffer of 2 ASCII chars in machine-endian order,
+        /// narrows each WORD to a BYTE, then writes the 2-byte result to the output buffer also in
+        /// machine-endian order.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref byte outputBuffer, uint value)
+        {
+            Debug.Assert(AllCharsInUInt32AreAscii(value));
 
-            if (currentMask != 0)
+            if (BitConverter.IsLittleEndian)
             {
-                goto FoundNonAsciiDataInCurrentMask;
+                outputBuffer = (byte)value;
+                Unsafe.Add(ref outputBuffer, 1) = (byte)(value >> 16);
             }
-
-            // If we have less than 32 bytes to process, just go straight to the final unaligned
-            // read. There's no need to mess with the loop logic in the middle of this method.
-
-            // Adjust the remaining length to account for what we just read.
-            // For the remainder of this code path, bufferLength will be in bytes, not chars.
-
-            bufferLength <<= 1; // chars to bytes
-
-            if (bufferLength < 2 * SizeOfVector128InBytes)
+            else
             {
-                goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
+                Unsafe.Add(ref outputBuffer, 1) = (byte)value;
+                outputBuffer = (byte)(value >> 16);
             }
+        }
 
-            // Now adjust the read pointer so that future reads are aligned.
-
-            pBuffer = (char*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1));
-
-#if DEBUG
-            long numCharsRead = pBuffer - pOriginalBuffer;
-            Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector128InChars, "We should've made forward progress of at least one char.");
-            Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
-#endif
-
-            // Adjust remaining buffer length.
+        /// <summary>
+        /// Copies as many ASCII characters (U+0000..U+007F) as possible from <paramref name="pUtf16Buffer"/>
+        /// to <paramref name="pAsciiBuffer"/>, stopping when the first non-ASCII character is encountered
+        /// or once <paramref name="elementCount"/> elements have been converted. Returns the total number
+        /// of elements that were able to be converted.
+        /// </summary>
+        public static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
+        {
+            nuint currentOffset = 0;
 
-            bufferLength += (nuint)pOriginalBuffer;
-            bufferLength -= (nuint)pBuffer;
+            uint utf16Data32BitsHigh = 0, utf16Data32BitsLow = 0;
+            ulong utf16Data64Bits = 0;
 
-            // The buffer is now properly aligned.
-            // Read 2 vectors at a time if possible.
+            // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
+            // code below. This has two benefits: (a) we can take advantage of specific instructions like
+            // pmovmskb, ptest, vpminuw which we know are optimized, and (b) we can avoid downclocking the
+            // processor while this method is running.
 
-            if (bufferLength >= 2 * SizeOfVector128InBytes)
+            if (Sse2.IsSupported)
             {
-                char* pFinalVectorReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes);
-
-                // After this point, we no longer need to update the bufferLength value.
+                Debug.Assert(BitConverter.IsLittleEndian, "Assume little endian if SSE2 is supported.");
 
-                do
+                if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>())
                 {
-                    firstVector = Sse2.LoadAlignedVector128((short*)pBuffer);
-                    secondVector = Sse2.LoadAlignedVector128((short*)pBuffer + SizeOfVector128InChars);
-                    Vector128<short> combinedVector = Sse2.Or(firstVector, secondVector);
+                    // Since there's overhead to setting up the vectorized code path, we only want to
+                    // call into it after a quick probe to ensure the next immediate characters really are ASCII.
+                    // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method.
 
-                    if (Sse41.IsSupported)
+                    if (PlatformDependent.Is64BitProcess)
                     {
-                        // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
-                        // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
-                        if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
+                        utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer);
+                        if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
                         {
-                            goto FoundNonAsciiDataInFirstOrSecondVector;
+                            goto FoundNonAsciiDataIn64BitRead;
                         }
                     }
                     else
                     {
-                        // See comment earlier in the method for an explanation of how the below logic works.
-                        if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
+                        utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer);
+                        utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + 4 / sizeof(char));
+                        if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
                         {
-                            goto FoundNonAsciiDataInFirstOrSecondVector;
+                            goto FoundNonAsciiDataIn64BitRead;
                         }
                     }
 
-                    pBuffer += 2 * SizeOfVector128InChars;
-                } while (pBuffer <= pFinalVectorReadPos);
-            }
-
-            // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from.
-            // Since the above loop doesn't update bufferLength, we can't rely on its absolute value.
-            // But we _can_ rely on it to tell us how much remaining data must be drained by looking
-            // at what bits of it are set. This works because had we updated it within the loop above,
-            // we would've been adding 2 * SizeOfVector128 on each iteration, but we only care about
-            // bits which are less significant than those that the addition would've acted on.
-
-            // If there is fewer than one vector length remaining, skip the next aligned read.
-            // Remember, at this point bufferLength is measured in bytes, not chars.
-
-            if (0ul >= (bufferLength & SizeOfVector128InBytes))
-            {
-                goto DoFinalUnalignedVectorRead;
-            }
-
-            // At least one full vector's worth of data remains, so we can safely read it.
-            // Remember, at this point pBuffer is still aligned.
-
-            firstVector = Sse2.LoadAlignedVector128((short*)pBuffer);
-
-            if (Sse41.IsSupported)
-            {
-                // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
-                // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
-                if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
-                {
-                    goto FoundNonAsciiDataInFirstVector;
-                }
-            }
-            else
-            {
-                // See comment earlier in the method for an explanation of how the below logic works.
-                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
-                if (currentMask != 0)
-                {
-                    goto FoundNonAsciiDataInCurrentMask;
-                }
-            }
-
-        IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
-
-            pBuffer += SizeOfVector128InChars;
-
-        DoFinalUnalignedVectorRead:
-
-            if (((byte)bufferLength & (SizeOfVector128InBytes - 1)) != 0)
-            {
-                // Perform an unaligned read of the last vector.
-                // We need to adjust the pointer because we're re-reading data.
-
-                pBuffer = (char*)((byte*)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes);
-                firstVector = Sse2.LoadVector128((short*)pBuffer); // unaligned load
-
-                if (Sse41.IsSupported)
-                {
-                    // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
-                    // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
-                    if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
-                    {
-                        goto FoundNonAsciiDataInFirstVector;
-                    }
-                }
-                else
-                {
-                    // See comment earlier in the method for an explanation of how the below logic works.
-                    currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
-                    if (currentMask != 0)
-                    {
-                        goto FoundNonAsciiDataInCurrentMask;
-                    }
-                }
-
-                pBuffer += SizeOfVector128InChars;
-            }
-
-        Finish:
-
-            Debug.Assert(((nuint)pBuffer - (nuint)pOriginalBuffer) % 2 == 0, "Shouldn't have incremented any pointer by an odd byte count.");
-            return ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char); // and we're done! (remember to adjust for char count)
-
-        FoundNonAsciiDataInFirstOrSecondVector:
-
-            // We don't know if the first or the second vector contains non-ASCII data. Check the first
-            // vector, and if that's all-ASCII then the second vector must be the culprit. Either way
-            // we'll make sure the first vector local is the one that contains the non-ASCII data.
-
-            // See comment earlier in the method for an explanation of how the below logic works.
-            if (Sse41.IsSupported)
-            {
-                if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
-                {
-                    goto FoundNonAsciiDataInFirstVector;
-                }
-            }
-            else
-            {
-                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
-                if (currentMask != 0)
-                {
-                    goto FoundNonAsciiDataInCurrentMask;
-                }
-            }
-
-            // Wasn't the first vector; must be the second.
-
-            pBuffer += SizeOfVector128InChars;
-            firstVector = secondVector;
-
-        FoundNonAsciiDataInFirstVector:
-
-            // See comment earlier in the method for an explanation of how the below logic works.
-            if (Sse41.IsSupported)
-            {
-                currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
-            }
-            else
-            {
-                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
-            }
-
-        FoundNonAsciiDataInCurrentMask:
-
-            // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
-            // Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't
-            // available, we'll fall back to a normal loop. (Even though the original vector used WORD elements,
-            // masks work on BYTE elements, and we account for this in the final fixup.)
-
-            Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
-            pBuffer = (char*)((byte*)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask));
-
-            goto Finish;
-
-        FoundNonAsciiDataInCurrentDWord:
-
-            uint currentDWord;
-            Debug.Assert(!AllCharsInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");
-
-            if (FirstCharInUInt32IsAscii(currentDWord))
-            {
-                pBuffer++; // skip past the ASCII char
-            }
-
-            goto Finish;
-
-        InputBufferLessThanOneVectorInLength:
-
-            // These code paths get hit if the original input length was less than one vector in size.
-            // We can't perform vectorized reads at this point, so we'll fall back to reading primitives
-            // directly. Note that all of these reads are unaligned.
-
-            // Reminder: If this code path is hit, bufferLength is still a char count, not a byte count.
-            // We skipped the code path that multiplied the count by sizeof(char).
-
-            Debug.Assert(bufferLength < SizeOfVector128InChars);
-
-            // QWORD drain
-
-            if ((bufferLength & 4) != 0)
-            {
-                if (Bmi1.X64.IsSupported)
-                {
-                    // If we can use 64-bit tzcnt to count the number of leading ASCII chars, prefer it.
-
-                    ulong candidateUInt64 = Unsafe.ReadUnaligned<ulong>(pBuffer);
-                    if (!AllCharsInUInt64AreAscii(candidateUInt64))
-                    {
-                        // Clear the low 7 bits (the ASCII bits) of each char, then tzcnt.
-                        // Remember the / 8 at the end to convert bit count to byte count,
-                        // then the & ~1 at the end to treat a match in the high byte of
-                        // any char the same as a match in the low byte of that same char.
-
-                        candidateUInt64 &= 0xFF80FF80_FF80FF80ul;
-                        pBuffer = (char*)((byte*)pBuffer + ((nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8) & ~(nuint)1));
-                        goto Finish;
-                    }
-                }
-                else
-                {
-                    // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead.
-
-                    currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
-                    uint nextDWord = Unsafe.ReadUnaligned<uint>(pBuffer + 4 / sizeof(char));
-
-                    if (!AllCharsInUInt32AreAscii(currentDWord | nextDWord))
-                    {
-                        // At least one of the values wasn't all-ASCII.
-                        // We need to figure out which one it was and stick it in the currentMask local.
-
-                        if (AllCharsInUInt32AreAscii(currentDWord))
-                        {
-                            currentDWord = nextDWord; // this one is the culprit
-                            pBuffer += 4 / sizeof(char);
-                        }
-
-                        goto FoundNonAsciiDataInCurrentDWord;
-                    }
-                }
-
-                pBuffer += 4; // successfully consumed 4 ASCII chars
-            }
-
-            // DWORD drain
-
-            if ((bufferLength & 2) != 0)
-            {
-                currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
-
-                if (!AllCharsInUInt32AreAscii(currentDWord))
-                {
-                    goto FoundNonAsciiDataInCurrentDWord;
-                }
-
-                pBuffer += 2; // successfully consumed 2 ASCII chars
-            }
-
-            // WORD drain
-            // This is the final drain; there's no need for a BYTE drain since our elemental type is 16-bit char.
-
-            if ((bufferLength & 1) != 0)
-            {
-                if (*pBuffer <= 0x007F)
-                {
-                    pBuffer++; // successfully consumed a single char
-                }
-            }
-
-            goto Finish;
-        }
-
-        /// <summary>
-        /// Given a QWORD which represents a buffer of 4 ASCII chars in machine-endian order,
-        /// narrows each WORD to a BYTE, then writes the 4-byte result to the output buffer
-        /// also in machine-endian order.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref byte outputBuffer, ulong value)
-        {
-            Debug.Assert(AllCharsInUInt64AreAscii(value));
-
-            if (Bmi2.X64.IsSupported)
-            {
-                // BMI2 will work regardless of the processor's endianness.
-                Unsafe.WriteUnaligned(ref outputBuffer, (uint)Bmi2.X64.ParallelBitExtract(value, 0x00FF00FF_00FF00FFul));
-            }
-            else
-            {
-                if (BitConverter.IsLittleEndian)
-                {
-                    outputBuffer = (byte)value;
-                    value >>= 16;
-                    Unsafe.Add(ref outputBuffer, 1) = (byte)value;
-                    value >>= 16;
-                    Unsafe.Add(ref outputBuffer, 2) = (byte)value;
-                    value >>= 16;
-                    Unsafe.Add(ref outputBuffer, 3) = (byte)value;
-                }
-                else
-                {
-                    Unsafe.Add(ref outputBuffer, 3) = (byte)value;
-                    value >>= 16;
-                    Unsafe.Add(ref outputBuffer, 2) = (byte)value;
-                    value >>= 16;
-                    Unsafe.Add(ref outputBuffer, 1) = (byte)value;
-                    value >>= 16;
-                    outputBuffer = (byte)value;
-                }
-            }
-        }
-
-        /// <summary>
-        /// Given a DWORD which represents a buffer of 2 ASCII chars in machine-endian order,
-        /// narrows each WORD to a BYTE, then writes the 2-byte result to the output buffer also in
-        /// machine-endian order.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref byte outputBuffer, uint value)
-        {
-            Debug.Assert(AllCharsInUInt32AreAscii(value));
-
-            if (BitConverter.IsLittleEndian)
-            {
-                outputBuffer = (byte)value;
-                Unsafe.Add(ref outputBuffer, 1) = (byte)(value >> 16);
-            }
-            else
-            {
-                Unsafe.Add(ref outputBuffer, 1) = (byte)value;
-                outputBuffer = (byte)(value >> 16);
-            }
-        }
-
-        /// <summary>
-        /// Copies as many ASCII characters (U+0000..U+007F) as possible from <paramref name="pUtf16Buffer"/>
-        /// to <paramref name="pAsciiBuffer"/>, stopping when the first non-ASCII character is encountered
-        /// or once <paramref name="elementCount"/> elements have been converted. Returns the total number
-        /// of elements that were able to be converted.
-        /// </summary>
-        public static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
-        {
-            nuint currentOffset = 0;
-
-            uint utf16Data32BitsHigh = 0, utf16Data32BitsLow = 0;
-            ulong utf16Data64Bits = 0;
-
-            // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
-            // code below. This has two benefits: (a) we can take advantage of specific instructions like
-            // pmovmskb, ptest, vpminuw which we know are optimized, and (b) we can avoid downclocking the
-            // processor while this method is running.
-
-            if (Sse2.IsSupported)
-            {
-                Debug.Assert(BitConverter.IsLittleEndian, "Assume little endian if SSE2 is supported.");
-
-                if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>())
-                {
-                    // Since there's overhead to setting up the vectorized code path, we only want to
-                    // call into it after a quick probe to ensure the next immediate characters really are ASCII.
-                    // If we see non-ASCII data, we'll jump immediately to the draining logic at the end of the method.
-
-                    if (PlatformDependent.Is64BitProcess)
-                    {
-                        utf16Data64Bits = Unsafe.ReadUnaligned<ulong>(pUtf16Buffer);
-                        if (!AllCharsInUInt64AreAscii(utf16Data64Bits))
-                        {
-                            goto FoundNonAsciiDataIn64BitRead;
-                        }
-                    }
-                    else
-                    {
-                        utf16Data32BitsHigh = Unsafe.ReadUnaligned<uint>(pUtf16Buffer);
-                        utf16Data32BitsLow = Unsafe.ReadUnaligned<uint>(pUtf16Buffer + 4 / sizeof(char));
-                        if (!AllCharsInUInt32AreAscii(utf16Data32BitsHigh | utf16Data32BitsLow))
-                        {
-                            goto FoundNonAsciiDataIn64BitRead;
-                        }
-                    }
-
-                    currentOffset = NarrowUtf16ToAscii_Sse2(pUtf16Buffer, pAsciiBuffer, elementCount);
-                }
+                    currentOffset = NarrowUtf16ToAscii_Sse2(pUtf16Buffer, pAsciiBuffer, elementCount);
+                }
             }
             else if (Vector.IsHardwareAccelerated)
             {
@@ -1285,439 +670,6 @@ public static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAsciiBu
 
             goto Finish;
         }
-
-        private static unsafe nuint NarrowUtf16ToAscii_Sse2(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
-        {
-            // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
-            // will be elided by JIT once we determine which specific ISAs we support.
-
-            // JIT turns the below into constants
-
-            uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
-            nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);
-
-            // This method is written such that control generally flows top-to-bottom, avoiding
-            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
-            // data, we jump out of the hot paths to targets at the end of the method.
-
-            Debug.Assert(Sse2.IsSupported);
-            Debug.Assert(BitConverter.IsLittleEndian);
-            Debug.Assert(elementCount >= 2 * SizeOfVector128);
-
-            Vector128<short> asciiMaskForPTEST = Vector128.Create(unchecked((short)0xFF80)); // used for PTEST on supported hardware
-            Vector128<short> asciiMaskForPXOR = Vector128.Create(unchecked((short)0x8000)); // used for PXOR
-            Vector128<short> asciiMaskForPCMPGTW = Vector128.Create(unchecked((short)0x807F)); // used for PCMPGTW
-
-            // First, perform an unaligned read of the first part of the input buffer.
-
-            Vector128<short> utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer); // unaligned load
-
-            // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do.
-            // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
-
-            if (Sse41.IsSupported)
-            {
-                if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
-                {
-                    return 0;
-                }
-            }
-            else
-            {
-                if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
-                {
-                    return 0;
-                }
-            }
-
-            // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
-
-            Vector128<byte> asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
-            Sse2.StoreScalar((ulong*)pAsciiBuffer, asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
-
-            nuint currentOffsetInElements = SizeOfVector128 / 2; // we processed 8 elements so far
-
-            // We're going to get the best performance when we have aligned writes, so we'll take the
-            // hit of potentially unaligned reads in order to hit this sweet spot.
-
-            // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote
-            // the 8 bytes previously. If the 0x08 bit is set at the pinned address, then the 8 bytes we wrote
-            // previously mean that the 0x08 bit is *not* set at address &pAsciiBuffer[SizeOfVector128 / 2]. In
-            // that case we can immediately back up to the previous aligned boundary and start the main loop.
-            // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at
-            // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump
-            // just past the next aligned boundary address.
-
-            if (0u >= ((uint)pAsciiBuffer & (SizeOfVector128 / 2)))
-            {
-                // We need to perform one more partial vector write before we can get the alignment we want.
-
-                utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements); // unaligned load
-
-                // See comments earlier in this method for information about how this works.
-                if (Sse41.IsSupported)
-                {
-                    if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
-                    {
-                        goto Finish;
-                    }
-                }
-                else
-                {
-                    if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
-                    {
-                        goto Finish;
-                    }
-                }
-
-                // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
-                asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
-                Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
-            }
-
-            // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
-            // point, then use that as the base offset going forward.
-
-            currentOffsetInElements = SizeOfVector128 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector128);
-            Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector128, "We wrote at least 1 byte but no more than a whole vector.");
-
-            Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer.");
-            Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector128, "We should be able to run at least one whole vector.");
-
-            nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128;
-            do
-            {
-                // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector.
-
-                utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements); // unaligned load
-                Vector128<short> utf16VectorSecond = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements + SizeOfVector128 / sizeof(short)); // unaligned load
-                Vector128<short> combinedVector = Sse2.Or(utf16VectorFirst, utf16VectorSecond);
-
-                // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
-                if (Sse41.IsSupported)
-                {
-                    if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
-                    {
-                        goto FoundNonAsciiDataInLoop;
-                    }
-                }
-                else
-                {
-                    if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
-                    {
-                        goto FoundNonAsciiDataInLoop;
-                    }
-                }
-
-                // Build up the UTF-8 vector and perform the store.
-
-                asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorSecond);
-
-                Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector128 == 0, "Write should be aligned.");
-                Sse2.StoreAligned(pAsciiBuffer + currentOffsetInElements, asciiVector); // aligned
-
-                currentOffsetInElements += SizeOfVector128;
-            } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop);
-
-        Finish:
-
-            // There might be some ASCII data left over. That's fine - we'll let our caller handle the final drain.
-            return currentOffsetInElements;
-
-        FoundNonAsciiDataInLoop:
-
-            // Can we at least narrow the high vector?
-            // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
-            if (Sse41.IsSupported)
-            {
-                if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
-                {
-                    goto Finish; // found non-ASCII data
-                }
-            }
-            else
-            {
-                if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
-                {
-                    goto Finish; // found non-ASCII data
-                }
-            }
-
-            // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
-            asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
-
-            Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
-
-            Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is aligned
-            currentOffsetInElements += SizeOfVector128 / 2;
-
-            goto Finish;
-        }
-
-        /// <summary>
-        /// Copies as many ASCII bytes (00..7F) as possible from <paramref name="pAsciiBuffer"/>
-        /// to <paramref name="pUtf16Buffer"/>, stopping when the first non-ASCII byte is encountered
-        /// or once <paramref name="elementCount"/> elements have been converted. Returns the total number
-        /// of elements that were able to be converted.
-        /// </summary>
-        public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
-        {
-            nuint currentOffset = 0;
-
-            // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
-            // code below. This has two benefits: (a) we can take advantage of specific instructions like
-            // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
-            // this method is running.
-
-            if (Sse2.IsSupported)
-            {
-                if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>())
-                {
-                    currentOffset = WidenAsciiToUtf16_Sse2(pAsciiBuffer, pUtf16Buffer, elementCount);
-                }
-            }
-            else if (Vector.IsHardwareAccelerated)
-            {
-                uint SizeOfVector = (uint)Unsafe.SizeOf<Vector<byte>>(); // JIT will make this a const
-
-                // Only bother vectorizing if we have enough data to do so.
-                if (elementCount >= SizeOfVector)
-                {
-                    // Note use of SBYTE instead of BYTE below; we're using the two's-complement
-                    // representation of negative integers to act as a surrogate for "is ASCII?".
-
-                    nuint finalOffsetWhereCanLoop = elementCount - SizeOfVector;
-                    do
-                    {
-                        Vector<sbyte> asciiVector = Unsafe.ReadUnaligned<Vector<sbyte>>(pAsciiBuffer + currentOffset);
-                        if (Vector.LessThanAny(asciiVector, Vector<sbyte>.Zero))
-                        {
-                            break; // found non-ASCII data
-                        }
-
-                        Vector.Widen(Vector.AsVectorByte(asciiVector), out Vector<ushort> utf16LowVector, out Vector<ushort> utf16HighVector);
-
-                        // TODO: Is the below logic also valid for big-endian platforms?
-                        Unsafe.WriteUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset, utf16LowVector);
-                        Unsafe.WriteUnaligned<Vector<ushort>>(pUtf16Buffer + currentOffset + Vector<ushort>.Count, utf16HighVector);
-
-                        currentOffset += SizeOfVector;
-                    } while (currentOffset <= finalOffsetWhereCanLoop);
-                }
-            }
-
-            Debug.Assert(currentOffset <= elementCount);
-            nuint remainingElementCount = elementCount - currentOffset;
-
-            // Try to widen 32 bits -> 64 bits at a time.
-            // We needn't update remainingElementCount after this point.
-
-            uint asciiData;
-
-            if (remainingElementCount >= 4)
-            {
-                nuint finalOffsetWhereCanLoop = currentOffset + remainingElementCount - 4;
-                do
-                {
-                    asciiData = Unsafe.ReadUnaligned<uint>(pAsciiBuffer + currentOffset);
-                    if (!AllBytesInUInt32AreAscii(asciiData))
-                    {
-                        goto FoundNonAsciiData;
-                    }
-
-                    WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pUtf16Buffer[currentOffset], asciiData);
-                    currentOffset += 4;
-                } while (currentOffset <= finalOffsetWhereCanLoop);
-            }
-
-            // Try to widen 16 bits -> 32 bits.
-
-            if (((uint)remainingElementCount & 2) != 0)
-            {
-                asciiData = Unsafe.ReadUnaligned<ushort>(pAsciiBuffer + currentOffset);
-                if (!AllBytesInUInt32AreAscii(asciiData))
-                {
-                    goto FoundNonAsciiData;
-                }
-
-                if (BitConverter.IsLittleEndian)
-                {
-                    pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
-                    pUtf16Buffer[currentOffset + 1] = (char)(asciiData >> 8);
-                }
-                else
-                {
-                    pUtf16Buffer[currentOffset + 1] = (char)(byte)asciiData;
-                    pUtf16Buffer[currentOffset] = (char)(asciiData >> 8);
-                }
-
-                currentOffset += 2;
-            }
-
-            // Try to widen 8 bits -> 16 bits.
-
-            if (((uint)remainingElementCount & 1) != 0)
-            {
-                asciiData = pAsciiBuffer[currentOffset];
-                if (((byte)asciiData & 0x80) != 0)
-                {
-                    goto Finish;
-                }
-
-                pUtf16Buffer[currentOffset] = (char)asciiData;
-                currentOffset += 1;
-            }
-
-        Finish:
-
-            return currentOffset;
-
-        FoundNonAsciiData:
-
-            Debug.Assert(!AllBytesInUInt32AreAscii(asciiData), "Shouldn't have reached this point if we have an all-ASCII input.");
-
-            // Drain ASCII bytes one at a time.
-
-            while (0u >= (uint)((byte)asciiData & 0x80))
-            {
-                pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
-                currentOffset += 1;
-                asciiData >>= 8;
-            }
-
-            goto Finish;
-        }
-
-        private static unsafe nuint WidenAsciiToUtf16_Sse2(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
-        {
-            // JIT turns the below into constants
-
-            uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
-            nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);
-
-            // This method is written such that control generally flows top-to-bottom, avoiding
-            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
-            // data, we jump out of the hot paths to targets at the end of the method.
-
-            Debug.Assert(Sse2.IsSupported);
-            Debug.Assert(BitConverter.IsLittleEndian);
-            Debug.Assert(elementCount >= 2 * SizeOfVector128);
-
-            // We're going to get the best performance when we have aligned writes, so we'll take the
-            // hit of potentially unaligned reads in order to hit this sweet spot.
-
-            Vector128<byte> asciiVector;
-            Vector128<byte> utf16FirstHalfVector;
-            uint mask;
-
-            // First, perform an unaligned read of the first part of the input buffer.
-
-            asciiVector = Sse2.LoadVector128(pAsciiBuffer); // unaligned load
-            mask = (uint)Sse2.MoveMask(asciiVector);
-
-            // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do.
-
-            if ((byte)mask != 0)
-            {
-                return 0;
-            }
-
-            // Then perform an unaligned write of the first part of the input buffer.
-
-            Vector128<byte> zeroVector = Vector128<byte>.Zero;
-
-            utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector);
-            Sse2.Store((byte*)pUtf16Buffer, utf16FirstHalfVector); // unaligned
-
-            // Calculate how many elements we wrote in order to get pOutputBuffer to its next alignment
-            // point, then use that as the base offset going forward. Remember the >> 1 to account for
-            // that we wrote chars, not bytes. This means we may re-read data in the next iteration of
-            // the loop, but this is ok.
-
-            nuint currentOffset = (SizeOfVector128 >> 1) - (((nuint)pUtf16Buffer >> 1) & (MaskOfAllBitsInVector128 >> 1));
-            Debug.Assert(0 < currentOffset && currentOffset <= SizeOfVector128 / sizeof(char));
-
-            nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128;
-
-            do
-            {
-                // In a loop, perform an unaligned read, widen to two vectors, then aligned write the two vectors.
-
-                asciiVector = Sse2.LoadVector128(pAsciiBuffer + currentOffset); // unaligned load
-                mask = (uint)Sse2.MoveMask(asciiVector);
-
-                if (mask != 0)
-                {
-                    // non-ASCII byte somewhere
-                    goto NonAsciiDataSeenInInnerLoop;
-                }
-
-                byte* pStore = (byte*)(pUtf16Buffer + currentOffset);
-                Sse2.StoreAligned(pStore, Sse2.UnpackLow(asciiVector, zeroVector));
-
-                pStore += SizeOfVector128;
-                Sse2.StoreAligned(pStore, Sse2.UnpackHigh(asciiVector, zeroVector));
-
-                currentOffset += SizeOfVector128;
-            } while (currentOffset <= finalOffsetWhereCanRunLoop);
-
-        Finish:
-
-            return currentOffset;
-
-        NonAsciiDataSeenInInnerLoop:
-
-            // Can we at least widen the first part of the vector?
-
-            if (0u >= ((byte)mask))
-            {
-                // First part was all ASCII, widen
-                utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector);
-                Sse2.StoreAligned((byte*)(pUtf16Buffer + currentOffset), utf16FirstHalfVector);
-                currentOffset += SizeOfVector128 / 2;
-            }
-
-            goto Finish;
-        }
-
-        /// <summary>
-        /// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
-        /// writes them to the output buffer with machine endianness.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputBuffer, uint value)
-        {
-            Debug.Assert(AllBytesInUInt32AreAscii(value));
-
-            if (Bmi2.X64.IsSupported)
-            {
-                // BMI2 will work regardless of the processor's endianness.
-                Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul));
-            }
-            else
-            {
-                if (BitConverter.IsLittleEndian)
-                {
-                    outputBuffer = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 3) = (char)value;
-                }
-                else
-                {
-                    Unsafe.Add(ref outputBuffer, 3) = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 2) = (char)(byte)value;
-                    value >>= 8;
-                    Unsafe.Add(ref outputBuffer, 1) = (char)(byte)value;
-                    value >>= 8;
-                    outputBuffer = (char)value;
-                }
-            }
-        }
     }
 }
 #endif
diff --git a/src/DotNetty.Common/Internal/TextEncodings.Utf16.NetCore3.cs b/src/DotNetty.Common/Internal/TextEncodings.Utf16.NetCore3.cs
index 25bf98fc1..8e4010a0f 100644
--- a/src/DotNetty.Common/Internal/TextEncodings.Utf16.NetCore3.cs
+++ b/src/DotNetty.Common/Internal/TextEncodings.Utf16.NetCore3.cs
@@ -70,6 +70,98 @@ private static unsafe int GetBytesFastInternal(char* pChars, int charsLength, by
                 charsConsumed = (int)(pInputBufferRemaining - pChars);
                 return (int)(pOutputBufferRemaining - pBytes);
             }
+
+
+            /// <summary>
+            /// Transcodes the UTF-16 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-8.
+            /// </summary>
+            /// <remarks>
+            /// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-16 sequences
+            /// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
+            /// this method will not return <see cref="OperationStatus.InvalidData"/>.
+            /// </remarks>
+            public static unsafe OperationStatus ToUtf8(ReadOnlySpan<char> source, Span<byte> destination, out int charsRead, out int bytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
+            {
+                // Throwaway span accesses - workaround for https://github.com/dotnet/coreclr/issues/12332
+
+                _ = source.Length;
+                _ = destination.Length;
+
+                fixed (char* pOriginalSource = &MemoryMarshal.GetReference(source))
+                fixed (byte* pOriginalDestination = &MemoryMarshal.GetReference(destination))
+                {
+                    // We're going to bulk transcode as much as we can in a loop, iterating
+                    // every time we see bad data that requires replacement.
+
+                    OperationStatus operationStatus = OperationStatus.Done;
+                    char* pInputBufferRemaining = pOriginalSource;
+                    byte* pOutputBufferRemaining = pOriginalDestination;
+
+                    while (!source.IsEmpty)
+                    {
+                        // We've pinned the spans at the entry point to this method.
+                        // It's safe for us to use Unsafe.AsPointer on them during this loop.
+
+                        operationStatus = Utf8Utility.TranscodeToUtf8(
+                            pInputBuffer: (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source)),
+                            inputLength: source.Length,
+                            pOutputBuffer: (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination)),
+                            outputBytesRemaining: destination.Length,
+                            pInputBufferRemaining: out pInputBufferRemaining,
+                            pOutputBufferRemaining: out pOutputBufferRemaining);
+
+                        // If we finished the operation entirely or we ran out of space in the destination buffer,
+                        // or if we need more input data and the caller told us that there's possibly more data
+                        // coming, return immediately.
+
+                        if (operationStatus <= OperationStatus.DestinationTooSmall
+                            || (operationStatus == OperationStatus.NeedMoreData && !isFinalBlock))
+                        {
+                            break;
+                        }
+
+                        // We encountered invalid data, or we need more data but the caller told us we're
+                        // at the end of the stream. In either case treat this as truly invalid.
+                        // If the caller didn't tell us to replace invalid sequences, return immediately.
+
+                        if (!replaceInvalidSequences)
+                        {
+                            operationStatus = OperationStatus.InvalidData; // status code may have been NeedMoreData - force to be error
+                            break;
+                        }
+
+                        // We're going to attempt to write U+FFFD to the destination buffer.
+                        // Do we even have enough space to do so?
+
+                        destination = destination.Slice((int)(pOutputBufferRemaining - (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination))));
+
+                        if (2 >= (uint)destination.Length)
+                        {
+                            operationStatus = OperationStatus.DestinationTooSmall;
+                            break;
+                        }
+
+                        destination[0] = 0xEF; // U+FFFD = [ EF BF BD ] in UTF-8
+                        destination[1] = 0xBF;
+                        destination[2] = 0xBD;
+                        destination = destination.Slice(3);
+
+                        // Invalid UTF-16 sequences are always of length 1. Just skip the next character.
+
+                        source = source.Slice((int)(pInputBufferRemaining - (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source))) + 1);
+
+                        operationStatus = OperationStatus.Done; // we patched the error - if we're about to break out of the loop this is a success case
+                        pInputBufferRemaining = (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));
+                        pOutputBufferRemaining = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination));
+                    }
+
+                    // Not possible to make any further progress - report to our caller how far we got.
+
+                    charsRead = (int)(pInputBufferRemaining - pOriginalSource);
+                    bytesWritten = (int)(pOutputBufferRemaining - pOriginalDestination);
+                    return operationStatus;
+                }
+            }
         }
     }
 }
diff --git a/src/DotNetty.Common/Internal/TextEncodings.Utf8.NetCore3.cs b/src/DotNetty.Common/Internal/TextEncodings.Utf8.NetCore3.cs
index c0cda71ca..200b5d460 100644
--- a/src/DotNetty.Common/Internal/TextEncodings.Utf8.NetCore3.cs
+++ b/src/DotNetty.Common/Internal/TextEncodings.Utf8.NetCore3.cs
@@ -2,8 +2,11 @@
 namespace DotNetty.Common.Internal
 {
     using System;
+    using System.Buffers;
+    using System.Diagnostics;
     using System.Runtime.CompilerServices;
     using System.Runtime.InteropServices;
+    using System.Text;
 
     public static partial class TextEncodings
     {
@@ -122,6 +125,102 @@ static ArgumentException GetArgumentException()
                     return new ArgumentException("Argument_ConversionOverflow");
                 }
             }
+
+            /// <summary>
+            /// Transcodes the UTF-8 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-16.
+            /// </summary>
+            /// <remarks>
+            /// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-8 sequences
+            /// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
+            /// this method will not return <see cref="OperationStatus.InvalidData"/>.
+            /// </remarks>
+            public static unsafe OperationStatus ToUtf16(ReadOnlySpan<byte> source, Span<char> destination, out int bytesRead, out int charsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
+            {
+                // Throwaway span accesses - workaround for https://github.com/dotnet/coreclr/issues/12332
+
+                _ = source.Length;
+                _ = destination.Length;
+
+                // We'll be mutating these values throughout our loop.
+
+                fixed (byte* pOriginalSource = &MemoryMarshal.GetReference(source))
+                fixed (char* pOriginalDestination = &MemoryMarshal.GetReference(destination))
+                {
+                    // We're going to bulk transcode as much as we can in a loop, iterating
+                    // every time we see bad data that requires replacement.
+
+                    OperationStatus operationStatus = OperationStatus.Done;
+                    byte* pInputBufferRemaining = pOriginalSource;
+                    char* pOutputBufferRemaining = pOriginalDestination;
+
+                    while (!source.IsEmpty)
+                    {
+                        // We've pinned the spans at the entry point to this method.
+                        // It's safe for us to use Unsafe.AsPointer on them during this loop.
+
+                        operationStatus = Utf8Utility.TranscodeToUtf16(
+                            pInputBuffer: (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source)),
+                            inputLength: source.Length,
+                            pOutputBuffer: (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination)),
+                            outputCharsRemaining: destination.Length,
+                            pInputBufferRemaining: out pInputBufferRemaining,
+                            pOutputBufferRemaining: out pOutputBufferRemaining);
+
+                        // If we finished the operation entirely or we ran out of space in the destination buffer,
+                        // or if we need more input data and the caller told us that there's possibly more data
+                        // coming, return immediately.
+
+                        if (operationStatus <= OperationStatus.DestinationTooSmall
+                            || (operationStatus == OperationStatus.NeedMoreData && !isFinalBlock))
+                        {
+                            break;
+                        }
+
+                        // We encountered invalid data, or we need more data but the caller told us we're
+                        // at the end of the stream. In either case treat this as truly invalid.
+                        // If the caller didn't tell us to replace invalid sequences, return immediately.
+
+                        if (!replaceInvalidSequences)
+                        {
+                            operationStatus = OperationStatus.InvalidData; // status code may have been NeedMoreData - force to be error
+                            break;
+                        }
+
+                        // We're going to attempt to write U+FFFD to the destination buffer.
+                        // Do we even have enough space to do so?
+
+                        destination = destination.Slice((int)(pOutputBufferRemaining - (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination))));
+
+                        if (destination.IsEmpty)
+                        {
+                            operationStatus = OperationStatus.DestinationTooSmall;
+                            break;
+                        }
+
+                        destination[0] = (char)UnicodeUtility.ReplacementChar;
+                        destination = destination.Slice(1);
+
+                        // Now figure out how many bytes of the source we must skip over before we should retry
+                        // the operation. This might be more than 1 byte.
+
+                        source = source.Slice((int)(pInputBufferRemaining - (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source))));
+                        Debug.Assert(!source.IsEmpty, "Expected 'Done' if source is fully consumed.");
+
+                        Rune.DecodeFromUtf8(source, out _, out int bytesConsumedJustNow);
+                        source = source.Slice(bytesConsumedJustNow);
+
+                        operationStatus = OperationStatus.Done; // we patched the error - if we're about to break out of the loop this is a success case
+                        pInputBufferRemaining = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));
+                        pOutputBufferRemaining = (char*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination));
+                    }
+
+                    // Not possible to make any further progress - report to our caller how far we got.
+
+                    bytesRead = (int)(pInputBufferRemaining - pOriginalSource);
+                    charsWritten = (int)(pOutputBufferRemaining - pOriginalDestination);
+                    return operationStatus;
+                }
+            }
         }
     }
 }
diff --git a/src/DotNetty.Common/Internal/TextEncodings.cs b/src/DotNetty.Common/Internal/TextEncodings.cs
index f43029558..e3e252148 100644
--- a/src/DotNetty.Common/Internal/TextEncodings.cs
+++ b/src/DotNetty.Common/Internal/TextEncodings.cs
@@ -7,13 +7,13 @@
     public static partial class TextEncodings
     {
         /// <summary>不提供 Unicode 字节顺序标记，检测到无效的编码时不引发异常</summary>
-        public static readonly UTF8Encoding UTF8NoBOM = new UTF8Encoding(false);
+        public static readonly UTF8Encoding UTF8NoBOM = new(false);
 
         /// <summary>不提供 Unicode 字节顺序标记，检测到无效的编码时引发异常</summary>
-        public static readonly UTF8Encoding SecureUTF8NoBOM = new UTF8Encoding(false, true);
+        public static readonly UTF8Encoding SecureUTF8NoBOM = new(false, true);
 
         /// <summary>提供 Unicode 字节顺序标记，检测到无效的编码时引发异常</summary>
-        public static readonly UTF8Encoding SecureUTF8 = new UTF8Encoding(true, true);
+        public static readonly UTF8Encoding SecureUTF8 = new(true, true);
 
         public const int ASCIICodePage = 20127;
 
diff --git a/src/DotNetty.Common/Internal/UnicodeDebug.cs b/src/DotNetty.Common/Internal/UnicodeDebug.cs
index 095370d94..c1b04d2ed 100644
--- a/src/DotNetty.Common/Internal/UnicodeDebug.cs
+++ b/src/DotNetty.Common/Internal/UnicodeDebug.cs
@@ -13,31 +13,46 @@ internal static class UnicodeDebug
         [Conditional("DEBUG")]
         internal static void AssertIsHighSurrogateCodePoint(uint codePoint)
         {
-            Debug.Assert(UnicodeUtility.IsHighSurrogateCodePoint(codePoint), $"The value {ToHexString(codePoint)} is not a valid UTF-16 high surrogate code point.");
+            if (!UnicodeUtility.IsHighSurrogateCodePoint(codePoint))
+            {
+                Debug.Fail($"The value {ToHexString(codePoint)} is not a valid UTF-16 high surrogate code point.");
+            }
         }
 
         [Conditional("DEBUG")]
         internal static void AssertIsLowSurrogateCodePoint(uint codePoint)
         {
-            Debug.Assert(UnicodeUtility.IsLowSurrogateCodePoint(codePoint), $"The value {ToHexString(codePoint)} is not a valid UTF-16 low surrogate code point.");
+            if (!UnicodeUtility.IsLowSurrogateCodePoint(codePoint))
+            {
+                Debug.Fail($"The value {ToHexString(codePoint)} is not a valid UTF-16 low surrogate code point.");
+            }
         }
 
         [Conditional("DEBUG")]
         internal static void AssertIsValidCodePoint(uint codePoint)
         {
-            Debug.Assert(UnicodeUtility.IsValidCodePoint(codePoint), $"The value {ToHexString(codePoint)} is not a valid Unicode code point.");
+            if (!UnicodeUtility.IsValidCodePoint(codePoint))
+            {
+                Debug.Fail($"The value {ToHexString(codePoint)} is not a valid Unicode code point.");
+            }
         }
 
         [Conditional("DEBUG")]
         internal static void AssertIsValidScalar(uint scalarValue)
         {
-            Debug.Assert(UnicodeUtility.IsValidUnicodeScalar(scalarValue), $"The value {ToHexString(scalarValue)} is not a valid Unicode scalar value.");
+            if (!UnicodeUtility.IsValidUnicodeScalar(scalarValue))
+            {
+                Debug.Fail($"The value {ToHexString(scalarValue)} is not a valid Unicode scalar value.");
+            }
         }
 
         [Conditional("DEBUG")]
         internal static void AssertIsValidSupplementaryPlaneScalar(uint scalarValue)
         {
-            Debug.Assert(UnicodeUtility.IsValidUnicodeScalar(scalarValue) && !UnicodeUtility.IsBmpCodePoint(scalarValue), $"The value {ToHexString(scalarValue)} is not a valid supplementary plane Unicode scalar value.");
+            if (!UnicodeUtility.IsValidUnicodeScalar(scalarValue) || UnicodeUtility.IsBmpCodePoint(scalarValue))
+            {
+                Debug.Fail($"The value {ToHexString(scalarValue)} is not a valid supplementary plane Unicode scalar value.");
+            }
         }
 
         /// <summary>
diff --git a/src/DotNetty.Common/Internal/UnicodeUtility.cs b/src/DotNetty.Common/Internal/UnicodeUtility.cs
index da12856f4..d2c98244a 100644
--- a/src/DotNetty.Common/Internal/UnicodeUtility.cs
+++ b/src/DotNetty.Common/Internal/UnicodeUtility.cs
@@ -120,14 +120,14 @@ public static int GetUtf8SequenceLength(uint value)
         /// Per http://www.unicode.org/glossary/#ASCII, ASCII is only U+0000..U+007F.
         /// </remarks>
         [MethodImpl(InlineMethod.AggressiveOptimization)]
-        public static bool IsAsciiCodePoint(uint value) => (value <= 0x7Fu);
+        public static bool IsAsciiCodePoint(uint value) => value <= 0x7Fu;
 
         /// <summary>
         /// Returns <see langword="true"/> iff <paramref name="value"/> is in the
         /// Basic Multilingual Plane (BMP).
         /// </summary>
         [MethodImpl(InlineMethod.AggressiveOptimization)]
-        public static bool IsBmpCodePoint(uint value) => (value <= 0xFFFFu);
+        public static bool IsBmpCodePoint(uint value) => value <= 0xFFFFu;
 
         /// <summary>
         /// Returns <see langword="true"/> iff <paramref name="value"/> is a UTF-16 high surrogate code point,
@@ -142,7 +142,7 @@ public static int GetUtf8SequenceLength(uint value)
         /// </summary>
         [MethodImpl(InlineMethod.AggressiveOptimization)]
         public static bool IsInRangeInclusive(uint value, uint lowerBound, uint upperBound)
-            => ((value - lowerBound) <= (upperBound - lowerBound));
+            => (value - lowerBound) <= (upperBound - lowerBound);
 
         ///// <summary>
         ///// Returns <see langword="true"/> if <paramref name="value"/> is between
@@ -187,7 +187,7 @@ public static bool IsInRangeInclusive(long value, long lowerBound, long upperBou
         /// point, i.e., is in [ U+0000..U+10FFFF ], inclusive.
         /// </summary>
         [MethodImpl(InlineMethod.AggressiveOptimization)]
-        public static bool IsValidCodePoint(uint codePoint) => (codePoint <= 0x10FFFFU);
+        public static bool IsValidCodePoint(uint codePoint) => codePoint <= 0x10FFFFU;
 
         /// <summary>
         /// Returns <see langword="true"/> iff <paramref name="value"/> is a valid Unicode scalar
@@ -197,7 +197,7 @@ public static bool IsInRangeInclusive(long value, long lowerBound, long upperBou
         public static bool IsValidUnicodeScalar(uint value)
         {
             // This is an optimized check that on x86 is just three instructions: lea, xor, cmp.
-            // 
+            //
             // After the subtraction operation, the input value is modified as such:
             // [ 00000000..0010FFFF ] -> [ FFEF0000..FFFFFFFF ]
             //
diff --git a/src/DotNetty.Common/Internal/Utf16Utility.Validation.Net.cs b/src/DotNetty.Common/Internal/Utf16Utility.Validation.Net.cs
new file mode 100644
index 000000000..73ec52772
--- /dev/null
+++ b/src/DotNetty.Common/Internal/Utf16Utility.Validation.Net.cs
@@ -0,0 +1,508 @@
+﻿// borrowed from https://github.com/dotnet/corefx/tree/release/3.1/src/Common/src/CoreLib/System/Text/Unicode
+
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#if NET
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.X86;
+using System.Numerics;
+using nuint_64 = System.UInt64;
+using nuint_32 = System.UInt32;
+
+namespace DotNetty.Common.Internal
+{
+    internal static unsafe partial class Utf16Utility
+    {
+        // Returns &inputBuffer[inputLength] if the input buffer is valid.
+        /// <summary>
+        /// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
+        /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
+        /// </summary>
+        /// <remarks>
+        /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
+        /// </remarks>
+        public static char* GetPointerToFirstInvalidChar(char* pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
+        {
+            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
+            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
+
+            // First, we'll handle the common case of all-ASCII. If this is able to
+            // consume the entire buffer, we'll skip the remainder of this method's logic.
+
+            int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);
+            Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength);
+
+            pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
+            inputLength -= numAsciiCharsConsumedJustNow;
+
+            if (0u >= (uint)inputLength)
+            {
+                utf8CodeUnitCountAdjustment = 0;
+                scalarCountAdjustment = 0;
+                return pInputBuffer;
+            }
+
+            // If we got here, it means we saw some non-ASCII data, so within our
+            // vectorized code paths below we'll handle all non-surrogate UTF-16
+            // code points branchlessly. We'll only branch if we see surrogates.
+            //
+            // We still optimistically assume the data is mostly ASCII. This means that the
+            // number of UTF-8 code units and the number of scalars almost matches the number
+            // of UTF-16 code units. As we go through the input and find non-ASCII
+            // characters, we'll keep track of these "adjustment" fixups. To get the
+            // total number of UTF-8 code units required to encode the input data, add
+            // the UTF-8 code unit count adjustment to the number of UTF-16 code units
+            // seen.  To get the total number of scalars present in the input data,
+            // add the scalar count adjustment to the number of UTF-16 code units seen.
+
+            long tempUtf8CodeUnitCountAdjustment = 0;
+            int tempScalarCountAdjustment = 0;
+
+            // Per https://github.com/dotnet/runtime/issues/41699, temporarily disabling
+            // ARM64-intrinsicified code paths. ARM64 platforms may still use the vectorized
+            // non-intrinsicified 'else' block below.
+
+            if (/* (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || */ Sse2.IsSupported)
+            {
+                if (inputLength >= Vector128<ushort>.Count)
+                {
+                    Vector128<ushort> vector0080 = Vector128.Create((ushort)0x80);
+                    Vector128<ushort> vectorA800 = Vector128.Create((ushort)0xA800);
+                    Vector128<short> vector8800 = Vector128.Create(unchecked((short)0x8800));
+                    Vector128<ushort> vectorZero = Vector128<ushort>.Zero;
+
+                    Vector128<byte> bitMask128 = BitConverter.IsLittleEndian ?
+                        Vector128.Create(0x80402010_08040201).AsByte() :
+                        Vector128.Create(0x01020408_10204080).AsByte();
+
+                    do
+                    {
+                        Vector128<ushort> utf16Data;
+                        if (AdvSimd.Arm64.IsSupported)
+                        {
+                            utf16Data = AdvSimd.LoadVector128((ushort*)pInputBuffer); // unaligned
+                        }
+                        else
+                        {
+                            utf16Data = Sse2.LoadVector128((ushort*)pInputBuffer); // unaligned
+                        }
+
+                        Vector128<ushort> charIsNonAscii;
+
+                        if (AdvSimd.Arm64.IsSupported)
+                        {
+                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
+                            // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
+                            charIsNonAscii = AdvSimd.Min(utf16Data, vector0080);
+                        }
+                        else if (Sse41.IsSupported)
+                        {
+                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
+                            // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
+                            charIsNonAscii = Sse41.Min(utf16Data, vector0080);
+                        }
+                        else
+                        {
+                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
+                            // input was 0x0080 <= [value] <= 0x7FFF. The case where 0x8000 <= [value] will
+                            // be handled in a few lines.
+
+                            charIsNonAscii = Sse2.AndNot(Sse2.CompareGreaterThan(vector0080.AsInt16(), utf16Data.AsInt16()).AsUInt16(), vector0080);
+                        }
+
+#if DEBUG
+                        // Quick check to ensure we didn't accidentally set the 0x8000 bit of any element.
+                        uint debugMask;
+                        if (AdvSimd.Arm64.IsSupported)
+                        {
+                            debugMask = GetNonAsciiBytes(charIsNonAscii.AsByte(), bitMask128);
+                        }
+                        else
+                        {
+                            debugMask = (uint)Sse2.MoveMask(charIsNonAscii.AsByte());
+                        }
+                        Debug.Assert((debugMask & 0b_1010_1010_1010_1010) == 0, "Shouldn't have set the 0x8000 bit of any element in 'charIsNonAscii'.");
+#endif // DEBUG
+
+                        // Sets the 0x8080 bits of each element in 'charIsThreeByteUtf8Encoded' if the corresponding
+                        // input was 0x0800 <= [value]. This also handles the missing range a few lines above.
+
+                        Vector128<ushort> charIsThreeByteUtf8Encoded;
+                        uint mask;
+
+                        if (AdvSimd.IsSupported)
+                        {
+                            charIsThreeByteUtf8Encoded = AdvSimd.Subtract(vectorZero, AdvSimd.ShiftRightLogical(utf16Data, 11));
+                            mask = GetNonAsciiBytes(AdvSimd.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte(), bitMask128);
+                        }
+                        else
+                        {
+                            charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11));
+                            mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
+                        }
+
+                        // Each even bit of mask will be 1 only if the char was >= 0x0080,
+                        // and each odd bit of mask will be 1 only if the char was >= 0x0800.
+                        //
+                        // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
+                        //
+                        //            ,-- set if char[1] is >= 0x0800
+                        //            |   ,-- set if char[0] is >= 0x0800
+                        //            v   v
+                        // mask = ... 1 1 0 1
+                        //              ^   ^-- set if char[0] is non-ASCII
+                        //              `-- set if char[1] is non-ASCII
+                        //
+                        // This means we can popcnt the number of set bits, and the result is the
+                        // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
+                        // it expands. This results in the wrong count for UTF-16 surrogate code
+                        // units (we just counted that each individual code unit expands to 3 bytes,
+                        // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
+                        // We'll handle this in just a moment.
+                        //
+                        // For now, compute the popcnt but squirrel it away. We'll fold it in to the
+                        // cumulative UTF-8 adjustment factor once we determine that there are no
+                        // unpaired surrogates in our data. (Unpaired surrogates would invalidate
+                        // our computed result and we'd have to throw it away.)
+
+                        uint popcnt = (uint)BitOperations.PopCount(mask);
+
+                        // Surrogates need to be special-cased for two reasons: (a) we need
+                        // to account for the fact that we over-counted in the addition above;
+                        // and (b) they require separate validation.
+                        if (AdvSimd.Arm64.IsSupported)
+                        {
+                            utf16Data = AdvSimd.Add(utf16Data, vectorA800);
+                            mask = GetNonAsciiBytes(AdvSimd.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte(), bitMask128);
+                        }
+                        else
+                        {
+                            utf16Data = Sse2.Add(utf16Data, vectorA800);
+                            mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
+                        }
+
+                        if (mask != 0)
+                        {
+                            // There's at least one UTF-16 surrogate code unit present.
+                            // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
+                            // the resulting bits of 'mask' will occur in pairs:
+                            // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
+                            // - 11 if the corresponding UTF-16 char was a surrogate code unit.
+                            //
+                            // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
+                            // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
+                            // a low surrogate. Since we added 0xA800 in the vectorized operation above,
+                            // our surrogate pairs will now have the bit pattern [ 10000q## ######## ].
+                            // If we logical right-shift each word by 3, we'll end up with the bit pattern
+                            // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
+                            // determine whether a given char was a high or a low surrogate.
+                            //
+                            // Therefore the resulting bits of 'mask2' will occur in pairs:
+                            // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
+                            // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
+                            // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.
+                            //   Since 'mask' already has 00 in these positions (since the corresponding char
+                            //   wasn't a surrogate), "mask AND mask2 == 00" holds for these positions.
+
+                            uint mask2;
+                            if (AdvSimd.Arm64.IsSupported)
+                            {
+                                mask2 = GetNonAsciiBytes(AdvSimd.ShiftRightLogical(utf16Data, 3).AsByte(), bitMask128);
+                            }
+                            else
+                            {
+                                mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());
+                            }
+
+                            // 'lowSurrogatesMask' has its bits occur in pairs:
+                            // - 01 if the corresponding char was a low surrogate char,
+                            // - 00 if the corresponding char was a high surrogate char or not a surrogate at all.
+
+                            uint lowSurrogatesMask = mask2 & mask;
+
+                            // 'highSurrogatesMask' has its bits occur in pairs:
+                            // - 01 if the corresponding char was a high surrogate char,
+                            // - 00 if the corresponding char was a low surrogate char or not a surrogate at all.
+
+                            uint highSurrogatesMask = (mask2 ^ 0b_0101_0101_0101_0101u /* flip all even-numbered bits 00 <-> 01 */) & mask;
+
+                            Debug.Assert((highSurrogatesMask & lowSurrogatesMask) == 0,
+                                "A char cannot simultaneously be both a high and a low surrogate char.");
+
+                            Debug.Assert(((highSurrogatesMask | lowSurrogatesMask) & 0b_1010_1010_1010_1010u) == 0,
+                                "Only even bits (no odd bits) of the masks should be set.");
+
+                            // Now check that each high surrogate is followed by a low surrogate and that each
+                            // low surrogate follows a high surrogate. We make an exception for the case where
+                            // the final char of the vector is a high surrogate, since we can't perform validation
+                            // on it until the next iteration of the loop when we hope to consume the matching
+                            // low surrogate.
+
+                            highSurrogatesMask <<= 2;
+                            if ((ushort)highSurrogatesMask != lowSurrogatesMask)
+                            {
+                                goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
+                            }
+
+                            if (highSurrogatesMask > ushort.MaxValue)
+                            {
+                                // There was a standalone high surrogate at the end of the vector.
+                                // We'll adjust our counters so that we don't consider this char consumed.
+
+                                highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
+                                popcnt -= 2; // the '0xC000_0000' bits in the original mask are shifted out and discarded, so account for that here
+                                pInputBuffer--;
+                                inputLength++;
+                            }
+
+                            // If we're 64-bit, we can perform the zero-extension of the surrogate pairs count for
+                            // free right now, saving the extension step a few lines below. If we're 32-bit, the
+                            // conversion to nuint immediately below is a no-op, and we'll pay the cost of the real
+                            // 64-bit extension a few lines below.
+                            nuint surrogatePairsCountNuint = (uint)BitOperations.PopCount(highSurrogatesMask);
+
+                            // 2 UTF-16 chars become 1 Unicode scalar
+
+                            tempScalarCountAdjustment -= (int)surrogatePairsCountNuint;
+
+                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
+                            // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation
+                            // assumes that the pair is encoded as 6 UTF-8 code units. Since each
+                            // pair is in reality only encoded as 4 UTF-8 code units, we need to
+                            // perform this adjustment now.
+
+                            if (PlatformDependent.Is64BitProcess)
+                            {
+                                // Since we've already zero-extended surrogatePairsCountNuint, we can directly
+                                // sub + sub. It's more efficient than shl + sub.
+                                tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
+                                tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
+                            }
+                            else
+                            {
+                                // Take the hit of the 64-bit extension now.
+                                tempUtf8CodeUnitCountAdjustment -= 2 * (uint)surrogatePairsCountNuint;
+                            }
+                        }
+
+                        tempUtf8CodeUnitCountAdjustment += popcnt;
+                        pInputBuffer += Vector128<ushort>.Count;
+                        inputLength -= Vector128<ushort>.Count;
+                    } while (inputLength >= Vector128<ushort>.Count);
+                }
+            }
+            else if (Vector.IsHardwareAccelerated)
+            {
+                if (inputLength >= Vector<ushort>.Count)
+                {
+                    Vector<ushort> vector0080 = new Vector<ushort>(0x0080);
+                    Vector<ushort> vector0400 = new Vector<ushort>(0x0400);
+                    Vector<ushort> vector0800 = new Vector<ushort>(0x0800);
+                    Vector<ushort> vectorD800 = new Vector<ushort>(0xD800);
+
+                    do
+                    {
+                        // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
+                        // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
+                        // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
+                        // vectors, each element of the sum will contain one of three values:
+                        //
+                        // 0x0000 ( 0) = original char was 0000..007F
+                        // 0xFFFF (-1) = original char was 0080..07FF
+                        // 0xFFFE (-2) = original char was 0800..FFFF
+                        //
+                        // We'll negate them to produce a value 0..2 for each element, then sum all the
+                        // elements together to produce the number of *additional* UTF-8 code units
+                        // required to represent this UTF-16 data. This is similar to the popcnt step
+                        // performed by the SSE2 code path. This will overcount surrogates, but we'll
+                        // handle that shortly.
+
+                        Vector<ushort> utf16Data = Unsafe.ReadUnaligned<Vector<ushort>>(pInputBuffer);
+                        Vector<ushort> twoOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0080);
+                        Vector<ushort> threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
+                        nuint popcnt = 0;
+                        if (PlatformDependent.Is64BitProcess)
+                        {
+                            Vector<nuint_64> sumVector = (Vector<nuint_64>)(Vector<ushort>.Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);
+
+                            // We'll try summing by a natural word (rather than a 16-bit word) at a time,
+                            // which should halve the number of operations we must perform.
+
+                            for (int i = 0; i < Vector<nuint_64>.Count; i++)
+                            {
+                                popcnt += (nuint)sumVector[i];
+                            }
+                        }
+                        else
+                        {
+                            Vector<nuint_32> sumVector = (Vector<nuint_32>)(Vector<ushort>.Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);
+
+                            // We'll try summing by a natural word (rather than a 16-bit word) at a time,
+                            // which should halve the number of operations we must perform.
+
+                            for (int i = 0; i < Vector<nuint_32>.Count; i++)
+                            {
+                                popcnt += (nuint)sumVector[i];
+                            }
+                        }
+
+                        uint popcnt32 = (uint)popcnt;
+                        if (PlatformDependent.Is64BitProcess)
+                        {
+                            popcnt32 += (uint)(popcnt >> 32);
+                        }
+
+                        // As in the SSE4.1 paths, compute popcnt but don't fold it in until we
+                        // know there aren't any unpaired surrogates in the input data.
+
+                        popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16);
+
+                        // Now check for surrogates.
+
+                        utf16Data -= vectorD800;
+                        Vector<ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
+                        if (surrogateChars != Vector<ushort>.Zero)
+                        {
+                            // There's at least one surrogate (high or low) UTF-16 code unit in
+                            // the vector. We'll build up additional vectors: 'highSurrogateChars'
+                            // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
+                            // UTF-16 code unit was a high or low surrogate, respectively.
+
+                            Vector<ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
+                            Vector<ushort> lowSurrogateChars = Vector.AndNot(surrogateChars, highSurrogateChars);
+
+                            // We want to make sure that each high surrogate code unit is followed by
+                            // a low surrogate code unit and each low surrogate code unit follows a
+                            // high surrogate code unit. Since we don't have an equivalent of pmovmskb
+                            // or palignr available to us, we'll do this as a loop. We won't look at
+                            // the very last high surrogate char element since we don't yet know if
+                            // the next vector read will have a low surrogate char element.
+
+                            if (lowSurrogateChars[0] != 0)
+                            {
+                                goto Error; // error: start of buffer contains standalone low surrogate char
+                            }
+
+                            ushort surrogatePairsCount = 0;
+                            for (int i = 0; i < Vector<ushort>.Count - 1; i++)
+                            {
+                                surrogatePairsCount -= highSurrogateChars[i]; // turns into +1 or +0
+                                if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
+                                {
+                                    goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
+                                }
+                            }
+
+                            if (highSurrogateChars[Vector<ushort>.Count - 1] != 0)
+                            {
+                                // There was a standalone high surrogate at the end of the vector.
+                                // We'll adjust our counters so that we don't consider this char consumed.
+
+                                pInputBuffer--;
+                                inputLength++;
+                                popcnt32 -= 2;
+                            }
+
+                            nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size
+
+                            // 2 UTF-16 chars become 1 Unicode scalar
+
+                            tempScalarCountAdjustment -= (int)surrogatePairsCountNint;
+
+                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
+                            // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
+                            // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
+                            // so we'll adjust this now.
+
+                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
+                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
+                        }
+
+                        tempUtf8CodeUnitCountAdjustment += popcnt32;
+                        pInputBuffer += Vector<ushort>.Count;
+                        inputLength -= Vector<ushort>.Count;
+                    } while (inputLength >= Vector<ushort>.Count);
+                }
+            }
+
+        NonVectorizedLoop:
+
+            // Vectorization isn't supported on our current platform, or the input was too small to benefit
+            // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
+            // drain remaining valid chars before we report failure.
+
+            for (; inputLength > 0; pInputBuffer++, inputLength--)
+            {
+                uint thisChar = pInputBuffer[0];
+                if (thisChar <= 0x7F)
+                {
+                    continue;
+                }
+
+                // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
+                // This optimistically assumes no surrogates, which we'll handle shortly.
+
+                tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;
+
+                if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
+                {
+                    continue;
+                }
+
+                // Found a surrogate char. Back out the adjustment we made above, then
+                // try to consume the entire surrogate pair all at once. We won't bother
+                // trying to interpret the surrogate pair as a scalar value; we'll only
+                // validate that its bit pattern matches what's expected for a surrogate pair.
+
+                tempUtf8CodeUnitCountAdjustment -= 2;
+
+                if (inputLength == 1)
+                {
+                    goto Error; // input buffer too small to read a surrogate pair
+                }
+
+                thisChar = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
+                {
+                    goto Error; // not a well-formed surrogate pair
+                }
+
+                tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar
+                tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units
+
+                pInputBuffer++; // consumed one extra char
+                inputLength--;
+            }
+
+        Error:
+
+            // Also used for normal return.
+
+            utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
+            scalarCountAdjustment = tempScalarCountAdjustment;
+            return pInputBuffer;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static uint GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bitMask128)
+        {
+            Debug.Assert(AdvSimd.Arm64.IsSupported);
+            // Collapses the per-byte "high bit set" (i.e. non-ASCII) flags of 'value' into one 16-bit scalar.
+            Vector128<byte> mostSignificantBitIsSet = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte(); // arithmetic shift replicates the sign bit: 0xFF if the byte's high bit is set, else 0x00
+            Vector128<byte> extractedBits = AdvSimd.And(mostSignificantBitIsSet, bitMask128); // NOTE(review): presumably bitMask128 gives each byte a distinct bit within its 8-byte half, so the sums below cannot carry - confirm against callers
+
+            // Self-pairwise add three times (16 -> 8 -> 4 -> 2 partial sums) until all flags have moved to the first two bytes of the vector.
+            extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
+            extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
+            extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
+            return extractedBits.AsUInt16().ToScalar(); // read the first two bytes as one ushort, widened to uint on return
+        }
+    }
+}
+#endif
diff --git a/src/DotNetty.Common/Internal/Utf16Utility.Validation.cs b/src/DotNetty.Common/Internal/Utf16Utility.Validation.NetCore3.cs
similarity index 99%
rename from src/DotNetty.Common/Internal/Utf16Utility.Validation.cs
rename to src/DotNetty.Common/Internal/Utf16Utility.Validation.NetCore3.cs
index c4f438c30..dfff6de9b 100644
--- a/src/DotNetty.Common/Internal/Utf16Utility.Validation.cs
+++ b/src/DotNetty.Common/Internal/Utf16Utility.Validation.NetCore3.cs
@@ -4,13 +4,13 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
-#if NETCOREAPP_3_0_GREATER
+#if NETCOREAPP3_1
 using System;
 using System.Diagnostics;
+using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 using System.Numerics;
-using System.Runtime.CompilerServices;
 using nuint_64 = System.UInt64;
 using nuint_32 = System.UInt32;
 
@@ -50,7 +50,7 @@ internal static unsafe partial class Utf16Utility
             // If we got here, it means we saw some non-ASCII data, so within our
             // vectorized code paths below we'll handle all non-surrogate UTF-16
             // code points branchlessly. We'll only branch if we see surrogates.
-            // 
+            //
             // We still optimistically assume the data is mostly ASCII. This means that the
             // number of UTF-8 code units and the number of scalars almost matches the number
             // of UTF-16 code units. As we go through the input and find non-ASCII
diff --git a/src/DotNetty.Common/Internal/Utf16Utility.cs b/src/DotNetty.Common/Internal/Utf16Utility.cs
index 7c7d93e97..29b1ced1a 100644
--- a/src/DotNetty.Common/Internal/Utf16Utility.cs
+++ b/src/DotNetty.Common/Internal/Utf16Utility.cs
@@ -152,6 +152,7 @@ internal static bool UInt32OrdinalIgnoreCaseAscii(uint valueA, uint valueB)
             Debug.Assert(AllCharsInUInt32AreAscii(valueA));
             Debug.Assert(AllCharsInUInt32AreAscii(valueB));
 
+#if NETCOREAPP3_1
             // a mask of all bits which are different between A and B
             uint differentBits = valueA ^ valueB;
 
@@ -177,6 +178,48 @@ internal static bool UInt32OrdinalIgnoreCaseAscii(uint valueA, uint valueB)
             // computation we performed at the beginning of the method.
 
             return 0u >= (((combinedIndicator >> 2) | ~0x0020_0020u) & differentBits);
+#else
+            // Generate a mask of all bits which are different between A and B. Since [A-Z]
+            // and [a-z] differ by the 0x20 bit, we'll left-shift this by 2 now so that
+            // this is moved over to the 0x80 bit, which nicely aligns with the calculation
+            // we're going to do on the indicator flag later.
+            //
+            // n.b. All of the logic below assumes we have at least 2 "known zero" bits leading
+            // each of the 7-bit ASCII values. This assumption won't hold if this method is
+            // ever adapted to deal with packed bytes instead of packed chars.
+
+            uint differentBits = (valueA ^ valueB) << 2;
+
+            // Now, we want to generate a mask where for each word in the input, the mask contains
+            // 0xFF7F if the word is [A-Za-z], 0xFFFF if the word is not [A-Za-z]. We know each
+            // input word is ASCII (only the low 7 bits set), so we can use a combination of addition
+            // and logical operators as follows.
+            //
+            // original input   +05         |A0         +1A
+            // ====================================================
+            //         00 .. 3F -> 05 .. 44 -> A5 .. E4 -> BF .. FE
+            //               40 ->       45 ->       E5 ->       FF
+            // ([A-Z]) 41 .. 5A -> 46 .. 5F -> E6 .. FF -> 00 .. 19
+            //         5B .. 5F -> 60 .. 64 -> E0 .. E4 -> FA .. FE
+            //               60 ->       65 ->       E5 ->       FF
+            // ([a-z]) 61 .. 7A -> 66 .. 7F -> E6 .. FF -> 00 .. 19
+            //         7B .. 7F -> 80 .. 84 -> A0 .. A4 -> BA .. BE
+            //
+            // This combination of operations results in the 0x80 bit of each word being set
+            // iff the original word value was *not* [A-Za-z].
+
+            uint indicator = valueA + 0x0005_0005u;
+            indicator |= 0x00A0_00A0u;
+            indicator += 0x001A_001Au;
+            indicator |= 0xFF7F_FF7Fu; // normalize each word to 0xFF7F or 0xFFFF
+
+            // At this point, 'indicator' contains the mask of bits which are *not* allowed to
+            // differ between the inputs, and 'differentBits' contains the mask of bits which
+            // actually differ between the inputs. If these masks have any bits in common, then
+            // the two values are *not* equal under an OrdinalIgnoreCase comparer.
+
+            return 0u >= (differentBits & indicator);
+#endif
         }
 
         /// <summary>
@@ -193,6 +236,7 @@ internal static bool UInt64OrdinalIgnoreCaseAscii(ulong valueA, ulong valueB)
             Debug.Assert(AllCharsInUInt64AreAscii(valueA));
             Debug.Assert(AllCharsInUInt64AreAscii(valueB));
 
+#if NETCOREAPP3_1
             // the 0x80 bit of each word of 'lowerIndicator' will be set iff the word has value >= 'A'
             ulong lowerIndicator = valueA + 0x0080_0080_0080_0080ul - 0x0041_0041_0041_0041ul;
 
@@ -213,6 +257,17 @@ internal static bool UInt64OrdinalIgnoreCaseAscii(ulong valueA, ulong valueB)
             // happens to be faster on x64.
 
             return (valueA | combinedIndicator) == (valueB | combinedIndicator);
+#else
+            // Duplicate of logic in UInt32OrdinalIgnoreCaseAscii, but using 64-bit consts.
+            // See comments in that method for more info.
+
+            ulong differentBits = (valueA ^ valueB) << 2;
+            ulong indicator = valueA + 0x0005_0005_0005_0005ul;
+            indicator |= 0x00A0_00A0_00A0_00A0ul;
+            indicator += 0x001A_001A_001A_001Aul;
+            indicator |= 0xFF7F_FF7F_FF7F_FF7Ful;
+            return 0ul >= (differentBits & indicator);
+#endif
         }
     }
 }
diff --git a/src/DotNetty.Common/Internal/Utf8Utility.Helpers.cs b/src/DotNetty.Common/Internal/Utf8Utility.Helpers.cs
index f88c4b8ab..dfaba7a4e 100644
--- a/src/DotNetty.Common/Internal/Utf8Utility.Helpers.cs
+++ b/src/DotNetty.Common/Internal/Utf8Utility.Helpers.cs
@@ -10,18 +10,20 @@
 using System.Diagnostics;
 using System.Numerics;
 using System.Runtime.CompilerServices;
+#if NETCOREAPP3_1
 using System.Runtime.Intrinsics.X86;
+#endif
 
 namespace DotNetty.Common.Internal
 {
-    internal static partial class Utf8Utility
+    partial class Utf8Utility
     {
         /// <summary>
         /// Given a machine-endian DWORD which four bytes of UTF-8 data, interprets the
         /// first three bytes as a three-byte UTF-8 subsequence and returns the UTF-16 representation.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static uint ExtractCharFromFirstThreeByteSequence(uint value)
+        private static uint ExtractCharFromFirstThreeByteSequence(uint value)
         {
             Debug.Assert(UInt32BeginsWithUtf8ThreeByteMask(value));
 
@@ -46,7 +48,7 @@ internal static uint ExtractCharFromFirstThreeByteSequence(uint value)
         /// first two bytes as a two-byte UTF-8 subsequence and returns the UTF-16 representation.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static uint ExtractCharFromFirstTwoByteSequence(uint value)
+        private static uint ExtractCharFromFirstTwoByteSequence(uint value)
         {
             Debug.Assert(UInt32BeginsWithUtf8TwoByteMask(value) && !UInt32BeginsWithOverlongUtf8TwoByteSequence(value));
 
@@ -68,10 +70,11 @@ internal static uint ExtractCharFromFirstTwoByteSequence(uint value)
         /// four-byte UTF-8 sequence and returns the machine-endian DWORD of the UTF-16 representation.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static uint ExtractCharsFromFourByteSequence(uint value)
+        private static uint ExtractCharsFromFourByteSequence(uint value)
         {
             if (BitConverter.IsLittleEndian)
             {
+#if NETCOREAPP3_1
                 if (Bmi2.IsSupported)
                 {
                     // need to reverse endianness for bit manipulation to work correctly
@@ -91,6 +94,7 @@ internal static uint ExtractCharsFromFourByteSequence(uint value)
                 }
                 else
                 {
+#endif
                     // input is UTF8 [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ] = scalar 000uuuuu zzzzyyyy yyxxxxxx
                     // want to return UTF16 scalar 000uuuuuzzzzyyyyyyxxxxxx = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
                     // where wwww = uuuuu - 1
@@ -104,7 +108,9 @@ internal static uint ExtractCharsFromFourByteSequence(uint value)
                     retVal += 0x0000_0800u; // retVal = [ 000000yy yyxxxxxx 110110ww wwzzzzyy ]
                     retVal += 0xDC00_0000u; // retVal = [ 110111yy yyxxxxxx 110110ww wwzzzzyy ]
                     return retVal;
+#if NETCOREAPP3_1
                 }
+#endif
             }
             else
             {
@@ -129,7 +135,7 @@ internal static uint ExtractCharsFromFourByteSequence(uint value)
         /// returns the packed 4-byte UTF-8 representation of this scalar value, also in machine-endian order.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static uint ExtractFourUtf8BytesFromSurrogatePair(uint value)
+        private static uint ExtractFourUtf8BytesFromSurrogatePair(uint value)
         {
             Debug.Assert(IsWellFormedUtf16SurrogatePair(value));
 
@@ -138,6 +144,7 @@ internal static uint ExtractFourUtf8BytesFromSurrogatePair(uint value)
                 // input = [ 110111yyyyxxxxxx 110110wwwwzzzzyy ] = scalar (000uuuuu zzzzyyyy yyxxxxxx)
                 // must return [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ], where wwww = uuuuu - 1
 
+#if NETCOREAPP3_1
                 if (Bmi2.IsSupported)
                 {
                     // Since pdep and pext have high latencies and can only be dispatched to a single execution port, we want
@@ -155,6 +162,7 @@ internal static uint ExtractFourUtf8BytesFromSurrogatePair(uint value)
                 }
                 else
                 {
+#endif
                     value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ]
 
                     uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ]
@@ -167,8 +175,10 @@ internal static uint ExtractFourUtf8BytesFromSurrogatePair(uint value)
                     uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ]
                     tempD |= 0x8080_80F0u;
 
-                    return (tempD | tempA | tempC); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
+                    return tempD | tempA | tempC; // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
+#if NETCOREAPP3_1
                 }
+#endif
             }
             else
             {
@@ -187,7 +197,7 @@ internal static uint ExtractFourUtf8BytesFromSurrogatePair(uint value)
                 tempD |= tempC;
 
                 uint tempE = (value & 0x3Fu) + 0xF080_8080u; // = [ 11110000 10000000 10000000 10xxxxxx ]
-                return (tempE | tempB | tempD); // = [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
+                return tempE | tempB | tempD; // = [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
             }
         }
 
@@ -199,7 +209,7 @@ internal static uint ExtractFourUtf8BytesFromSurrogatePair(uint value)
         /// <param name="value"></param>
         /// <returns></returns>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static uint ExtractTwoCharsPackedFromTwoAdjacentTwoByteSequences(uint value)
+        private static uint ExtractTwoCharsPackedFromTwoAdjacentTwoByteSequences(uint value)
         {
             // We don't want to swap the position of the high and low WORDs,
             // as the buffer was read in machine order and will be written in
@@ -223,7 +233,7 @@ internal static uint ExtractTwoCharsPackedFromTwoAdjacentTwoByteSequences(uint v
         /// adjacent UTF-8 two-byte sequences.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static uint ExtractTwoUtf8TwoByteSequencesFromTwoPackedUtf16Chars(uint value)
+        private static uint ExtractTwoUtf8TwoByteSequencesFromTwoPackedUtf16Chars(uint value)
         {
             // stays in machine endian
 
@@ -251,7 +261,7 @@ internal static uint ExtractTwoUtf8TwoByteSequencesFromTwoPackedUtf16Chars(uint
         /// as a UTF-8 two-byte sequence packed into a WORD and zero-extended to DWORD.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static uint ExtractUtf8TwoByteSequenceFromFirstUtf16Char(uint value)
+        private static uint ExtractUtf8TwoByteSequenceFromFirstUtf16Char(uint value)
         {
             // stays in machine endian
 
@@ -282,7 +292,7 @@ internal static uint ExtractUtf8TwoByteSequenceFromFirstUtf16Char(uint value)
         /// returns true iff the first UTF-16 character is ASCII.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool IsFirstCharAscii(uint value)
+        private static bool IsFirstCharAscii(uint value)
         {
             // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0000..007F ].
             // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0000..007F ].
@@ -299,7 +309,7 @@ internal static bool IsFirstCharAscii(uint value)
         /// This also returns true if the first UTF-16 character is a surrogate character (well-formedness is not validated).
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool IsFirstCharAtLeastThreeUtf8Bytes(uint value)
+        private static bool IsFirstCharAtLeastThreeUtf8Bytes(uint value)
         {
             // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0800..FFFF ].
             // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0800..FFFF ].
@@ -315,7 +325,7 @@ internal static bool IsFirstCharAtLeastThreeUtf8Bytes(uint value)
         /// returns true iff the first UTF-16 character is a surrogate character (either high or low).
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool IsFirstCharSurrogate(uint value)
+        private static bool IsFirstCharSurrogate(uint value)
         {
             // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ D800..DFFF ].
             // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ D800..DFFF ].
@@ -331,7 +341,7 @@ internal static bool IsFirstCharSurrogate(uint value)
         /// returns true iff the first UTF-16 character would be encoded as exactly 2 bytes in UTF-8.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool IsFirstCharTwoUtf8Bytes(uint value)
+        private static bool IsFirstCharTwoUtf8Bytes(uint value)
         {
             // Little-endian: Given [ #### AAAA ], return whether AAAA is in range [ 0080..07FF ].
             // Big-endian: Given [ AAAA #### ], return whether AAAA is in range [ 0080..07FF ].
@@ -351,7 +361,7 @@ internal static bool IsFirstCharTwoUtf8Bytes(uint value)
         /// is a UTF-8 continuation byte.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool IsLowByteUtf8ContinuationByte(uint value)
+        private static bool IsLowByteUtf8ContinuationByte(uint value)
         {
             // The JIT won't emit a single 8-bit signed cmp instruction (see IsUtf8ContinuationByte),
             // so the best we can do for now is the lea / cmp pair.
@@ -365,7 +375,7 @@ internal static bool IsLowByteUtf8ContinuationByte(uint value)
         /// returns true iff the second UTF-16 character is ASCII.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool IsSecondCharAscii(uint value)
+        private static bool IsSecondCharAscii(uint value)
         {
             // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0000..007F ].
             // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0000..007F ].
@@ -382,7 +392,7 @@ internal static bool IsSecondCharAscii(uint value)
         /// This also returns true if the second UTF-16 character is a surrogate character (well-formedness is not validated).
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool IsSecondCharAtLeastThreeUtf8Bytes(uint value)
+        private static bool IsSecondCharAtLeastThreeUtf8Bytes(uint value)
         {
             // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0800..FFFF ].
             // Big-endian: Given [ #### BBBB ], return whether ABBBBAAA is in range [ 0800..FFFF ].
@@ -398,7 +408,7 @@ internal static bool IsSecondCharAtLeastThreeUtf8Bytes(uint value)
         /// returns true iff the second UTF-16 character is a surrogate character (either high or low).
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool IsSecondCharSurrogate(uint value)
+        private static bool IsSecondCharSurrogate(uint value)
         {
             // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ D800..DFFF ].
             // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ D800..DFFF ].
@@ -414,7 +424,7 @@ internal static bool IsSecondCharSurrogate(uint value)
         /// returns true iff the second UTF-16 character would be encoded as exactly 2 bytes in UTF-8.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool IsSecondCharTwoUtf8Bytes(uint value)
+        private static bool IsSecondCharTwoUtf8Bytes(uint value)
         {
             // Little-endian: Given [ BBBB #### ], return whether BBBB is in range [ 0080..07FF ].
             // Big-endian: Given [ #### BBBB ], return whether BBBB is in range [ 0080..07FF ].
@@ -445,7 +455,7 @@ internal static bool IsUtf8ContinuationByte(in byte value)
             // The below check takes advantage of the two's complement representation of negative numbers.
             // [ 0b1000_0000, 0b1011_1111 ] is [ -127 (sbyte.MinValue), -65 ]
 
-            return ((sbyte)value < -64);
+            return (sbyte)value < -64;
         }
 
         /// <summary>
@@ -453,7 +463,7 @@ internal static bool IsUtf8ContinuationByte(in byte value)
         /// returns true iff the two characters represent a well-formed UTF-16 surrogate pair.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool IsWellFormedUtf16SurrogatePair(uint value)
+        private static bool IsWellFormedUtf16SurrogatePair(uint value)
         {
             // Little-endian: Given [ LLLL HHHH ], validate that LLLL in [ DC00..DFFF ] and HHHH in [ D800..DBFF ].
             // Big-endian: Given [ HHHH LLLL ], validate that HHHH in [ D800..DBFF ] and LLLL in [ DC00..DFFF ].
@@ -474,7 +484,7 @@ internal static bool IsWellFormedUtf16SurrogatePair(uint value)
         /// Converts a DWORD from machine-endian to little-endian.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static uint ToLittleEndian(uint value)
+        private static uint ToLittleEndian(uint value)
         {
             if (BitConverter.IsLittleEndian)
             {
@@ -494,7 +504,7 @@ internal static uint ToLittleEndian(uint value)
         /// 2-byte sequence mask (see <see cref="UInt32BeginsWithUtf8TwoByteMask"/>).
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool UInt32BeginsWithOverlongUtf8TwoByteSequence(uint value)
+        private static bool UInt32BeginsWithOverlongUtf8TwoByteSequence(uint value)
         {
             // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input.
             Debug.Assert(UInt32BeginsWithUtf8TwoByteMask(value));
@@ -517,7 +527,7 @@ internal static bool UInt32BeginsWithOverlongUtf8TwoByteSequence(uint value)
         /// still perform overlong form or out-of-range checking.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool UInt32BeginsWithUtf8FourByteMask(uint value)
+        private static bool UInt32BeginsWithUtf8FourByteMask(uint value)
         {
             // The code in this method is equivalent to the code
             // below but is slightly more optimized.
@@ -549,7 +559,7 @@ internal static bool UInt32BeginsWithUtf8FourByteMask(uint value)
         /// overlong form or surrogate checking.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool UInt32BeginsWithUtf8ThreeByteMask(uint value)
+        private static bool UInt32BeginsWithUtf8ThreeByteMask(uint value)
         {
             // The code in this method is equivalent to the code
             // below but is slightly more optimized.
@@ -581,7 +591,7 @@ internal static bool UInt32BeginsWithUtf8ThreeByteMask(uint value)
         /// overlong form checking.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool UInt32BeginsWithUtf8TwoByteMask(uint value)
+        private static bool UInt32BeginsWithUtf8TwoByteMask(uint value)
         {
             // The code in this method is equivalent to the code
             // below but is slightly more optimized.
@@ -613,7 +623,7 @@ internal static bool UInt32BeginsWithUtf8TwoByteMask(uint value)
         /// 2-byte sequence mask (see <see cref="UInt32BeginsWithUtf8TwoByteMask"/>).
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool UInt32EndsWithOverlongUtf8TwoByteSequence(uint value)
+        private static bool UInt32EndsWithOverlongUtf8TwoByteSequence(uint value)
         {
             // ASSUMPTION: Caller has already checked the '110yyyyy 10xxxxxx' mask of the input.
             Debug.Assert(UInt32EndsWithUtf8TwoByteMask(value));
@@ -639,7 +649,7 @@ internal static bool UInt32EndsWithOverlongUtf8TwoByteSequence(uint value)
         /// overlong form checking.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool UInt32EndsWithUtf8TwoByteMask(uint value)
+        private static bool UInt32EndsWithUtf8TwoByteMask(uint value)
         {
             // The code in this method is equivalent to the code
             // below but is slightly more optimized.
@@ -670,7 +680,7 @@ internal static bool UInt32EndsWithUtf8TwoByteMask(uint value)
         /// single operation. Returns <see langword="false"/> if running on a big-endian machine.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
+        private static bool UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
         {
             // Per Table 3-7, valid 2-byte sequences are [ C2..DF ] [ 80..BF ].
             // In little-endian, that would be represented as:
@@ -695,7 +705,7 @@ internal static bool UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(uint v
         /// single operation. Returns <see langword="false"/> if running on a big-endian machine.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
+        private static bool UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(uint value)
         {
             // See comments in UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian.
 
@@ -712,7 +722,7 @@ internal static bool UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(uint val
         /// returns <see langword="true"/> iff the first byte of the buffer is ASCII.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool UInt32FirstByteIsAscii(uint value)
+        private static bool UInt32FirstByteIsAscii(uint value)
         {
             // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
 
@@ -725,7 +735,7 @@ internal static bool UInt32FirstByteIsAscii(uint value)
         /// returns <see langword="true"/> iff the fourth byte of the buffer is ASCII.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool UInt32FourthByteIsAscii(uint value)
+        private static bool UInt32FourthByteIsAscii(uint value)
         {
             // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
 
@@ -738,7 +748,7 @@ internal static bool UInt32FourthByteIsAscii(uint value)
         /// returns <see langword="true"/> iff the second byte of the buffer is ASCII.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool UInt32SecondByteIsAscii(uint value)
+        private static bool UInt32SecondByteIsAscii(uint value)
         {
             // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
 
@@ -751,7 +761,7 @@ internal static bool UInt32SecondByteIsAscii(uint value)
         /// returns <see langword="true"/> iff the third byte of the buffer is ASCII.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static bool UInt32ThirdByteIsAscii(uint value)
+        private static bool UInt32ThirdByteIsAscii(uint value)
         {
             // Return statement is written this way to work around https://github.com/dotnet/coreclr/issues/914.
 
@@ -759,12 +769,13 @@ internal static bool UInt32ThirdByteIsAscii(uint value)
                 || (!BitConverter.IsLittleEndian && (0u >= (value & 0x8000u)));
         }
 
+#if NETCOREAPP3_1
         /// <summary>
         /// Given a DWORD which represents a buffer of 4 ASCII bytes, widen each byte to a 16-bit WORD
         /// and writes the resulting QWORD into the destination with machine endianness.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static void Widen4AsciiBytesToCharsAndWrite(ref char outputBuffer, uint value)
+        private static void Widen4AsciiBytesToCharsAndWrite(ref char outputBuffer, uint value)
         {
             if (Bmi2.X64.IsSupported)
             {
@@ -795,6 +806,7 @@ internal static void Widen4AsciiBytesToCharsAndWrite(ref char outputBuffer, uint
                 }
             }
         }
+#endif
 
         /// <summary>
         /// Given a DWORD which represents a buffer of 2 packed UTF-16 values in machine endianess,
@@ -802,7 +814,7 @@ internal static void Widen4AsciiBytesToCharsAndWrite(ref char outputBuffer, uint
         /// resulting 6 bytes to the destination buffer.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static void WriteTwoUtf16CharsAsTwoUtf8ThreeByteSequences(ref byte outputBuffer, uint value)
+        private static void WriteTwoUtf16CharsAsTwoUtf8ThreeByteSequences(ref byte outputBuffer, uint value)
         {
             Debug.Assert(IsFirstCharAtLeastThreeUtf8Bytes(value) && !IsFirstCharSurrogate(value), "First half of value should've been 0800..D7FF or E000..FFFF");
             Debug.Assert(IsSecondCharAtLeastThreeUtf8Bytes(value) && !IsSecondCharSurrogate(value), "Second half of value should've been 0800..D7FF or E000..FFFF");
@@ -838,7 +850,7 @@ internal static void WriteTwoUtf16CharsAsTwoUtf8ThreeByteSequences(ref byte outp
         /// resulting 3 bytes to the destination buffer.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static void WriteFirstUtf16CharAsUtf8ThreeByteSequence(ref byte outputBuffer, uint value)
+        private static void WriteFirstUtf16CharAsUtf8ThreeByteSequence(ref byte outputBuffer, uint value)
         {
             Debug.Assert(IsFirstCharAtLeastThreeUtf8Bytes(value) && !IsFirstCharSurrogate(value), "First half of value should've been 0800..D7FF or E000..FFFF");
 
diff --git a/src/DotNetty.Common/Internal/Utf8Utility.Transcoding.Net.cs b/src/DotNetty.Common/Internal/Utf8Utility.Transcoding.Net.cs
new file mode 100644
index 000000000..c088f2b77
--- /dev/null
+++ b/src/DotNetty.Common/Internal/Utf8Utility.Transcoding.Net.cs
@@ -0,0 +1,1510 @@
+﻿// borrowed from https://github.com/dotnet/corefx/tree/release/3.1/src/Common/src/CoreLib/System/Text/Unicode
+
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#if NET
+using System;
+using System.Buffers;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.X86;
+
+namespace DotNetty.Common.Internal
+{
+    unsafe partial class Utf8Utility
+    {
+        // On method return, pInputBufferRemaining and pOutputBufferRemaining will both point to where
+        // the next byte would have been consumed from / the next char would have been written to.
+        // inputLength in bytes, outputCharsRemaining in chars.
+        public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLength, char* pOutputBuffer, int outputCharsRemaining, out byte* pInputBufferRemaining, out char* pOutputBufferRemaining)
+        {
+            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
+            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
+
+            Debug.Assert(outputCharsRemaining >= 0, "Destination length must not be negative.");
+            Debug.Assert(pOutputBuffer != null || outputCharsRemaining == 0, "Destination length must be zero if destination buffer pointer is null.");
+
+            // First, try vectorized conversion.
+
+            {
+                nuint numElementsConverted = ASCIIUtility.WidenAsciiToUtf16(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputCharsRemaining));
+
+                pInputBuffer += numElementsConverted;
+                pOutputBuffer += numElementsConverted;
+
+                // Quick check - did we just end up consuming the entire input buffer?
+                // If so, short-circuit the remainder of the method.
+
+                if ((int)numElementsConverted == inputLength)
+                {
+                    pInputBufferRemaining = pInputBuffer;
+                    pOutputBufferRemaining = pOutputBuffer;
+                    return OperationStatus.Done;
+                }
+
+                inputLength -= (int)numElementsConverted;
+                outputCharsRemaining -= (int)numElementsConverted;
+            }
+
+            if (inputLength < sizeof(uint))
+            {
+                goto ProcessInputOfLessThanDWordSize;
+            }
+
+            byte* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - 4;
+
+            // Begin the main loop.
+
+#if DEBUG
+            byte* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
+#endif
+
+            Debug.Assert(pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer);
+            do
+            {
+                // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar.
+
+                uint thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+            AfterReadDWord:
+
+#if DEBUG
+                Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read.");
+                pLastBufferPosProcessed = pInputBuffer;
+#endif
+                // First, check for the common case of all-ASCII bytes.
+
+                if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
+                {
+                    // We read an all-ASCII sequence.
+
+                    if (outputCharsRemaining < sizeof(uint))
+                    {
+                        goto ProcessRemainingBytesSlow; // running out of space, but may be able to write some data
+                    }
+
+                    ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref *pOutputBuffer, thisDWord);
+                    pInputBuffer += 4;
+                    pOutputBuffer += 4;
+                    outputCharsRemaining -= 4;
+
+                    // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
+                    // Below is basically unrolled loops with poor man's vectorization.
+
+                    uint remainingInputBytes = (uint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;
+                    uint maxIters = Math.Min(remainingInputBytes, (uint)outputCharsRemaining) / (2 * sizeof(uint));
+                    uint secondDWord;
+                    int i;
+                    for (i = 0; (uint)i < maxIters; i++)
+                    {
+                        // Reading two DWORDs in parallel benchmarked faster than reading a single QWORD.
+
+                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                        secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + sizeof(uint));
+
+                        if (!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord | secondDWord))
+                        {
+                            goto LoopTerminatedEarlyDueToNonAsciiData;
+                        }
+
+                        pInputBuffer += 8;
+
+                        ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pOutputBuffer[0], thisDWord);
+                        ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pOutputBuffer[4], secondDWord);
+
+                        pOutputBuffer += 8;
+                    }
+
+                    outputCharsRemaining -= 8 * i;
+
+                    continue; // need to perform a bounds check because we might be running out of data
+
+                LoopTerminatedEarlyDueToNonAsciiData:
+
+                    if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
+                    {
+                        // The first DWORD contained all-ASCII bytes, so expand it.
+
+                        ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref *pOutputBuffer, thisDWord);
+
+                        // continue the outer loop from the second DWORD
+
+                        Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(secondDWord));
+                        thisDWord = secondDWord;
+
+                        pInputBuffer += 4;
+                        pOutputBuffer += 4;
+                        outputCharsRemaining -= 4;
+                    }
+
+                    outputCharsRemaining -= 8 * i;
+
+                    // We know that there's *at least* one DWORD of data remaining in the buffer.
+                    // We also know that it's not all-ASCII. We can skip the logic at the beginning of the main loop.
+
+                    goto AfterReadDWordSkipAllBytesAsciiCheck;
+                }
+
+            AfterReadDWordSkipAllBytesAsciiCheck:
+
+                Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)); // this should have been handled earlier
+
+                // Next, try stripping off ASCII bytes one at a time.
+                // We only handle up to three ASCII bytes here since we handled the four ASCII byte case above.
+
+                if (UInt32FirstByteIsAscii(thisDWord))
+                {
+                    if (outputCharsRemaining >= 3)
+                    {
+                        // Fast-track: we don't need to check the destination length for subsequent
+                        // ASCII bytes since we know we can write them all now.
+
+                        uint thisDWordLittleEndian = ToLittleEndian(thisDWord);
+
+                        nuint adjustment = 1;
+                        pOutputBuffer[0] = (char)(byte)thisDWordLittleEndian;
+
+                        if (UInt32SecondByteIsAscii(thisDWord))
+                        {
+                            adjustment++;
+                            thisDWordLittleEndian >>= 8;
+                            pOutputBuffer[1] = (char)(byte)thisDWordLittleEndian;
+
+                            if (UInt32ThirdByteIsAscii(thisDWord))
+                            {
+                                adjustment++;
+                                thisDWordLittleEndian >>= 8;
+                                pOutputBuffer[2] = (char)(byte)thisDWordLittleEndian;
+                            }
+                        }
+
+                        pInputBuffer += adjustment;
+                        pOutputBuffer += adjustment;
+                        outputCharsRemaining -= (int)adjustment;
+                    }
+                    else
+                    {
+                        // Slow-track: we need to make sure each individual write has enough
+                        // of a buffer so that we don't overrun the destination.
+
+                        if (outputCharsRemaining == 0)
+                        {
+                            goto OutputBufferTooSmall;
+                        }
+
+                        uint thisDWordLittleEndian = ToLittleEndian(thisDWord);
+
+                        pInputBuffer++;
+                        *pOutputBuffer++ = (char)(byte)thisDWordLittleEndian;
+                        outputCharsRemaining--;
+
+                        if (UInt32SecondByteIsAscii(thisDWord))
+                        {
+                            if (outputCharsRemaining == 0)
+                            {
+                                goto OutputBufferTooSmall;
+                            }
+
+                            pInputBuffer++;
+                            thisDWordLittleEndian >>= 8;
+                            *pOutputBuffer++ = (char)(byte)thisDWordLittleEndian;
+
+                            // We can perform a small optimization here. We know at this point that
+                            // the output buffer is fully consumed (we read two ASCII bytes and wrote
+                            // two ASCII chars, and we checked earlier that the destination buffer
+                            // can't store a third byte). If the next byte is ASCII, we can jump straight
+                            // to the return statement since the end-of-method logic only relies on the
+                            // destination buffer pointer -- NOT the output chars remaining count -- being
+                            // correct. If the next byte is not ASCII, we'll need to continue with the
+                            // rest of the main loop, but we can set the buffer length directly to zero
+                            // rather than decrementing it from 1 to 0.
+
+                            Debug.Assert(outputCharsRemaining == 1);
+
+                            if (UInt32ThirdByteIsAscii(thisDWord))
+                            {
+                                goto OutputBufferTooSmall;
+                            }
+                            else
+                            {
+                                outputCharsRemaining = 0;
+                            }
+                        }
+                    }
+
+                    if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                    {
+                        goto ProcessRemainingBytesSlow; // input buffer doesn't contain enough data to read a DWORD
+                    }
+                    else
+                    {
+                        // The input buffer at the current offset contains a non-ASCII byte.
+                        // Read an entire DWORD and fall through to multi-byte consumption logic.
+                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                    }
+                }
+
+            BeforeProcessTwoByteSequence:
+
+                // At this point, we know we're working with a multi-byte sequence,
+                // but we haven't yet validated it.
+
+                // The masks and comparands are derived from the Unicode Standard, Table 3-6.
+                // Additionally, we need to check for valid byte sequences per Table 3-7.
+
+                // Check the 2-byte case.
+
+                if (UInt32BeginsWithUtf8TwoByteMask(thisDWord))
+                {
+                    // Per Table 3-7, valid sequences are:
+                    // [ C2..DF ] [ 80..BF ]
+
+                    if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
+                    {
+                        goto Error;
+                    }
+
+                ProcessTwoByteSequenceSkipOverlongFormCheck:
+
+                    // Optimization: If this is a two-byte-per-character language like Cyrillic or Hebrew,
+                    // there's a good chance that if we see one two-byte run then there's another two-byte
+                    // run immediately after. Let's check that now.
+
+                    // On little-endian platforms, we can check for the two-byte UTF8 mask *and* validate that
+                    // the value isn't overlong using a single comparison. On big-endian platforms, we'll need
+                    // to validate the mask and validate that the sequence isn't overlong as two separate comparisons.
+
+                    if ((BitConverter.IsLittleEndian && UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
+                        || (!BitConverter.IsLittleEndian && (UInt32EndsWithUtf8TwoByteMask(thisDWord) && !UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord))))
+                    {
+                        // We have two runs of two bytes each.
+
+                        if (outputCharsRemaining < 2)
+                        {
+                            goto ProcessRemainingBytesSlow; // running out of output buffer
+                        }
+
+                        Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractTwoCharsPackedFromTwoAdjacentTwoByteSequences(thisDWord));
+
+                        pInputBuffer += 4;
+                        pOutputBuffer += 2;
+                        outputCharsRemaining -= 2;
+
+                        if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+                        {
+                            // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
+                            // also two bytes. Check for that first before going back to the beginning of the loop.
+
+                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+                            if (BitConverter.IsLittleEndian)
+                            {
+                                if (UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
+                                {
+                                    // The next sequence is a valid two-byte sequence.
+                                    goto ProcessTwoByteSequenceSkipOverlongFormCheck;
+                                }
+                            }
+                            else
+                            {
+                                if (UInt32BeginsWithUtf8TwoByteMask(thisDWord))
+                                {
+                                    if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
+                                    {
+                                        goto Error; // The next sequence purports to be a 2-byte sequence but is overlong.
+                                    }
+
+                                    goto ProcessTwoByteSequenceSkipOverlongFormCheck;
+                                }
+                            }
+
+                            // If we reached this point, the next sequence is something other than a valid
+                            // two-byte sequence, so go back to the beginning of the loop.
+                            goto AfterReadDWord;
+                        }
+                        else
+                        {
+                            goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
+                        }
+                    }
+
+                    // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
+                    // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
+                    // bytes are ASCII?
+
+                    uint charToWrite = ExtractCharFromFirstTwoByteSequence(thisDWord); // optimistically compute this now, but don't store until we know dest is large enough
+
+                    if (UInt32ThirdByteIsAscii(thisDWord))
+                    {
+                        if (UInt32FourthByteIsAscii(thisDWord))
+                        {
+                            if (outputCharsRemaining < 3)
+                            {
+                                goto ProcessRemainingBytesSlow; // running out of output buffer
+                            }
+
+                            pOutputBuffer[0] = (char)charToWrite;
+                            if (BitConverter.IsLittleEndian)
+                            {
+                                thisDWord >>= 16;
+                                pOutputBuffer[1] = (char)(byte)thisDWord;
+                                thisDWord >>= 8;
+                                pOutputBuffer[2] = (char)thisDWord;
+                            }
+                            else
+                            {
+                                pOutputBuffer[2] = (char)(byte)thisDWord;
+                                pOutputBuffer[1] = (char)(byte)(thisDWord >> 8);
+                            }
+                            pInputBuffer += 4;
+                            pOutputBuffer += 3;
+                            outputCharsRemaining -= 3;
+
+                            continue; // go back to original bounds check and check for ASCII
+                        }
+                        else
+                        {
+                            if (outputCharsRemaining < 2)
+                            {
+                                goto ProcessRemainingBytesSlow; // running out of output buffer
+                            }
+
+                            pOutputBuffer[0] = (char)charToWrite;
+                            pOutputBuffer[1] = (char)(byte)(thisDWord >> (BitConverter.IsLittleEndian ? 16 : 8));
+                            pInputBuffer += 3;
+                            pOutputBuffer += 2;
+                            outputCharsRemaining -= 2;
+
+                            // A two-byte sequence followed by an ASCII byte followed by a non-ASCII byte.
+                            // Read in the next DWORD and jump directly to the start of the multi-byte processing block.
+
+                            if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer)
+                            {
+                                goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
+                            }
+                            else
+                            {
+                                thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                                goto BeforeProcessTwoByteSequence;
+                            }
+                        }
+                    }
+                    else
+                    {
+                        if (outputCharsRemaining == 0)
+                        {
+                            goto ProcessRemainingBytesSlow; // running out of output buffer
+                        }
+
+                        pOutputBuffer[0] = (char)charToWrite;
+                        pInputBuffer += 2;
+                        pOutputBuffer++;
+                        outputCharsRemaining--;
+
+                        if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer)
+                        {
+                            goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
+                        }
+                        else
+                        {
+                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                            goto BeforeProcessThreeByteSequence; // we know the next byte isn't ASCII, and it's not the start of a 2-byte sequence (this was checked above)
+                        }
+                    }
+                }
+
+            // Check the 3-byte case.
+
+            BeforeProcessThreeByteSequence:
+
+                if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
+                {
+                ProcessThreeByteSequenceWithCheck:
+
+                    // We need to check for overlong or surrogate three-byte sequences.
+                    //
+                    // Per Table 3-7, valid sequences are:
+                    // [   E0   ] [ A0..BF ] [ 80..BF ]
+                    // [ E1..EC ] [ 80..BF ] [ 80..BF ]
+                    // [   ED   ] [ 80..9F ] [ 80..BF ]
+                    // [ EE..EF ] [ 80..BF ] [ 80..BF ]
+                    //
+                    // Big-endian examples of using the above validation table:
+                    // E0A0 = 1110 0000 1010 0000 => invalid (overlong ) patterns are 1110 0000 100# ####
+                    // ED9F = 1110 1101 1001 1111 => invalid (surrogate) patterns are 1110 1101 101# ####
+                    // If using the bitmask ......................................... 0000 1111 0010 0000 (=0F20),
+                    // Then invalid (overlong) patterns match the comparand ......... 0000 0000 0000 0000 (=0000),
+                    // And invalid (surrogate) patterns match the comparand ......... 0000 1101 0010 0000 (=0D20).
+
+                    if (BitConverter.IsLittleEndian)
+                    {
+                        // The "overlong or surrogate" check can be implemented using a single jump, but there's
+                        // some overhead to moving the bits into the correct locations in order to perform the
+                        // correct comparison, and in practice the processor's branch prediction capability is
+                        // good enough that we shouldn't bother. So we'll use two jumps instead.
+
+                        // Can't extract this check into its own helper method because JITter produces suboptimal
+                        // assembly, even with aggressive inlining.
+
+                        // Code below becomes 5 instructions: test, jz, lea, test, jz
+
+                        if (((thisDWord & 0x0000_200Fu) == 0) || (((thisDWord - 0x0000_200Du) & 0x0000_200Fu) == 0))
+                        {
+                            goto Error; // overlong or surrogate
+                        }
+                    }
+                    else
+                    {
+                        if (((thisDWord & 0x0F20_0000u) == 0) || (((thisDWord - 0x0D20_0000u) & 0x0F20_0000u) == 0))
+                        {
+                            goto Error; // overlong or surrogate
+                        }
+                    }
+
+                    // At this point, we know the incoming scalar is well-formed.
+
+                    if (outputCharsRemaining == 0)
+                    {
+                        goto OutputBufferTooSmall; // not enough space in the destination buffer to write
+                    }
+
+                    // As an optimization, on compatible platforms check if a second three-byte sequence immediately
+                    // follows the one we just read, and if so extract them together.
+
+                    if (BitConverter.IsLittleEndian)
+                    {
+                        // First, check that the leftover byte from the original DWORD is in the range [ E0..EF ], which
+                        // would indicate the potential start of a second three-byte sequence.
+
+                        if (((thisDWord - 0xE000_0000u) & 0xF000_0000u) == 0)
+                        {
+                            // The const '3' below is correct because pFinalPosWhereCanReadDWordFromInputBuffer represents
+                            // the final place where we can safely perform a DWORD read, and we want to probe whether it's
+                            // safe to read a DWORD beginning at address &pInputBuffer[3].
+
+                            if (outputCharsRemaining > 1 && (nint)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 3)
+                            {
+                                // We're going to attempt to read a second 3-byte sequence and write them both out one after the other.
+                                // We need to check the continuation bit mask on the remaining two bytes (and we may as well check the leading
+                                // byte mask again since it's free), then perform overlong + surrogate checks. If the overlong or surrogate
+                                // checks fail, we'll fall through to the remainder of the logic which will transcode the original valid
+                                // 3-byte UTF-8 sequence we read; and on the next iteration of the loop the validation routine will run again,
+                                // fail, and redirect control flow to the error handling logic at the very end of this method.
+
+                                uint secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + 3);
+
+                                if (UInt32BeginsWithUtf8ThreeByteMask(secondDWord)
+                                    && ((secondDWord & 0x0000_200Fu) != 0)
+                                    && (((secondDWord - 0x0000_200Du) & 0x0000_200Fu) != 0))
+                                {
+                                    pOutputBuffer[0] = (char)ExtractCharFromFirstThreeByteSequence(thisDWord);
+                                    pOutputBuffer[1] = (char)ExtractCharFromFirstThreeByteSequence(secondDWord);
+                                    pInputBuffer += 6;
+                                    pOutputBuffer += 2;
+                                    outputCharsRemaining -= 2;
+
+                                    // Drain any ASCII data following the second three-byte sequence.
+
+                                    goto CheckForAsciiByteAfterThreeByteSequence;
+                                }
+                            }
+                        }
+                    }
+
+                    // Couldn't extract 2x three-byte sequences together, just do this one by itself.
+
+                    *pOutputBuffer = (char)ExtractCharFromFirstThreeByteSequence(thisDWord);
+                    pInputBuffer += 3;
+                    pOutputBuffer++;
+                    outputCharsRemaining--;
+
+                CheckForAsciiByteAfterThreeByteSequence:
+
+                    // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
+                    // into the text. If this happens, strip the ASCII byte off now before seeing if the next
+                    // character consists of three code units.
+
+                    if (UInt32FourthByteIsAscii(thisDWord))
+                    {
+                        if (outputCharsRemaining == 0)
+                        {
+                            goto OutputBufferTooSmall;
+                        }
+
+                        if (BitConverter.IsLittleEndian)
+                        {
+                            *pOutputBuffer = (char)(thisDWord >> 24);
+                        }
+                        else
+                        {
+                            *pOutputBuffer = (char)(byte)thisDWord;
+                        }
+
+                        pInputBuffer++;
+                        pOutputBuffer++;
+                        outputCharsRemaining--;
+                    }
+
+                    if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
+                    {
+                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+                        // Optimization: A three-byte character could indicate CJK text, which makes it likely
+                        // that the character following this one is also CJK. We'll check for a three-byte sequence
+                        // marker now and jump directly to three-byte sequence processing if we see one, skipping
+                        // all of the logic at the beginning of the loop.
+
+                        if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
+                        {
+                            goto ProcessThreeByteSequenceWithCheck; // found a three-byte sequence marker; validate and consume
+                        }
+                        else
+                        {
+                            goto AfterReadDWord; // probably ASCII punctuation or whitespace
+                        }
+                    }
+                    else
+                    {
+                        goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
+                    }
+                }
+
+                // Assume the 4-byte case, but we need to validate.
+
+                {
+                    // We need to check for overlong or invalid (over U+10FFFF) four-byte sequences.
+                    //
+                    // Per Table 3-7, valid sequences are:
+                    // [   F0   ] [ 90..BF ] [ 80..BF ] [ 80..BF ]
+                    // [ F1..F3 ] [ 80..BF ] [ 80..BF ] [ 80..BF ]
+                    // [   F4   ] [ 80..8F ] [ 80..BF ] [ 80..BF ]
+
+                    if (!UInt32BeginsWithUtf8FourByteMask(thisDWord))
+                    {
+                        goto Error;
+                    }
+
+                    // Now check for overlong / out-of-range sequences.
+
+                    if (BitConverter.IsLittleEndian)
+                    {
+                        // The DWORD we read is [ 10xxxxxx 10yyyyyy 10zzzzzz 11110www ].
+                        // We want to get the 'w' byte in front of the 'z' byte so that we can perform
+                        // a single range comparison. We'll take advantage of the fact that the JITter
+                        // can detect a ROR / ROL operation, then we'll just zero out the bytes that
+                        // aren't involved in the range check.
+
+                        uint toCheck = thisDWord & 0x0000_FFFFu;
+
+                        // At this point, toCheck = [ 00000000 00000000 10zzzzzz 11110www ].
+
+                        toCheck = BitOperations.RotateRight(toCheck, 8);
+
+                        // At this point, toCheck = [ 11110www 00000000 00000000 10zzzzzz ].
+
+                        if (!UnicodeUtility.IsInRangeInclusive(toCheck, 0xF000_0090u, 0xF400_008Fu))
+                        {
+                            goto Error;
+                        }
+                    }
+                    else
+                    {
+                        if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0xF090_0000u, 0xF48F_FFFFu))
+                        {
+                            goto Error;
+                        }
+                    }
+
+                    // Validation complete.
+
+                    if (outputCharsRemaining < 2)
+                    {
+                        // There's no point to falling back to the "drain the input buffer" logic, since we know
+                        // we can't write anything to the destination. So we'll just exit immediately.
+                        goto OutputBufferTooSmall;
+                    }
+
+                    Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractCharsFromFourByteSequence(thisDWord));
+
+                    pInputBuffer += 4;
+                    pOutputBuffer += 2;
+                    outputCharsRemaining -= 2;
+
+                    continue; // go back to beginning of loop for processing
+                }
+            } while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer);
+
+        ProcessRemainingBytesSlow:
+            inputLength = (int)(void*)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;
+
+        ProcessInputOfLessThanDWordSize:
+            while (inputLength > 0)
+            {
+                uint firstByte = pInputBuffer[0];
+                if (firstByte <= 0x7Fu)
+                {
+                    if (outputCharsRemaining == 0)
+                    {
+                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+                    }
+
+                    // 1-byte (ASCII) case
+                    *pOutputBuffer = (char)firstByte;
+
+                    pInputBuffer++;
+                    pOutputBuffer++;
+                    inputLength--;
+                    outputCharsRemaining--;
+                    continue;
+                }
+
+                // Potentially the start of a multi-byte sequence?
+
+                firstByte -= 0xC2u;
+                if ((byte)firstByte <= (0xDFu - 0xC2u))
+                {
+                    // Potentially a 2-byte sequence?
+                    if (inputLength < 2)
+                    {
+                        goto InputBufferTooSmall; // out of data
+                    }
+
+                    uint secondByte = pInputBuffer[1];
+                    if (!IsLowByteUtf8ContinuationByte(secondByte))
+                    {
+                        goto Error; // 2-byte marker not followed by continuation byte
+                    }
+
+                    if (outputCharsRemaining == 0)
+                    {
+                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+                    }
+
+                    uint asChar = (firstByte << 6) + secondByte + ((0xC2u - 0xC0u) << 6) - 0x80u; // remove UTF-8 markers from scalar
+                    *pOutputBuffer = (char)asChar;
+
+                    pInputBuffer += 2;
+                    pOutputBuffer++;
+                    inputLength -= 2;
+                    outputCharsRemaining--;
+                    continue;
+                }
+                else if ((byte)firstByte <= (0xEFu - 0xC2u))
+                {
+                    // Potentially a 3-byte sequence?
+                    if (inputLength >= 3)
+                    {
+                        uint secondByte = pInputBuffer[1];
+                        uint thirdByte = pInputBuffer[2];
+                        if (!IsLowByteUtf8ContinuationByte(secondByte) || !IsLowByteUtf8ContinuationByte(thirdByte))
+                        {
+                            goto Error; // 3-byte marker not followed by 2 continuation bytes
+                        }
+
+                        // To speed up the validation logic below, we're not going to remove the UTF-8 markers from the partial char just yet.
+                        // We account for this in the comparisons below.
+
+                        uint partialChar = (firstByte << 12) + (secondByte << 6);
+                        if (partialChar < ((0xE0u - 0xC2u) << 12) + (0xA0u << 6))
+                        {
+                            goto Error; // this is an overlong encoding; fail
+                        }
+
+                        partialChar -= ((0xEDu - 0xC2u) << 12) + (0xA0u << 6); // if partialChar = 0, we're at beginning of UTF-16 surrogate code point range
+                        if (partialChar < 0x0800u /* number of code points in UTF-16 surrogate code point range */)
+                        {
+                            goto Error; // attempted to encode a UTF-16 surrogate code point; fail
+                        }
+
+                        if (outputCharsRemaining == 0)
+                        {
+                            goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+                        }
+
+                        // Now restore the full scalar value.
+
+                        partialChar += thirdByte;
+                        partialChar += 0xD800; // undo "move to beginning of UTF-16 surrogate code point range" from earlier, fold it with later adds
+                        partialChar -= 0x80u; // remove third byte continuation marker
+
+                        *pOutputBuffer = (char)partialChar;
+
+                        pInputBuffer += 3;
+                        pOutputBuffer++;
+                        inputLength -= 3;
+                        outputCharsRemaining--;
+                        continue;
+                    }
+                    else if (inputLength >= 2)
+                    {
+                        uint secondByte = pInputBuffer[1];
+                        if (!IsLowByteUtf8ContinuationByte(secondByte))
+                        {
+                            goto Error; // 3-byte marker not followed by continuation byte
+                        }
+
+                        // We can't build up the entire scalar value now, but we can check for overlong / surrogate representations
+                        // from just the first two bytes.
+
+                        uint partialChar = (firstByte << 6) + secondByte; // don't worry about fixing up the UTF-8 markers; we'll account for it in the below comparison
+                        if (partialChar < ((0xE0u - 0xC2u) << 6) + 0xA0u)
+                        {
+                            goto Error; // failed overlong check
+                        }
+                        if (UnicodeUtility.IsInRangeInclusive(partialChar, ((0xEDu - 0xC2u) << 6) + 0xA0u, ((0xEEu - 0xC2u) << 6) + 0x7Fu))
+                        {
+                            goto Error; // failed surrogate check
+                        }
+                    }
+
+                    goto InputBufferTooSmall; // out of data
+                }
+                else if ((byte)firstByte <= (0xF4u - 0xC2u))
+                {
+                    // Potentially a 4-byte sequence?
+
+                    if (inputLength < 2)
+                    {
+                        goto InputBufferTooSmall; // ran out of data
+                    }
+
+                    uint nextByte = pInputBuffer[1];
+                    if (!IsLowByteUtf8ContinuationByte(nextByte))
+                    {
+                        goto Error; // 4-byte marker not followed by a continuation byte
+                    }
+
+                    uint asPartialChar = (firstByte << 6) + nextByte; // don't worry about fixing up the UTF-8 markers; we'll account for it in the below comparison
+                    if (!UnicodeUtility.IsInRangeInclusive(asPartialChar, ((0xF0u - 0xC2u) << 6) + 0x90u, ((0xF4u - 0xC2u) << 6) + 0x8Fu))
+                    {
+                        goto Error; // failed overlong / out-of-range check
+                    }
+
+                    if (inputLength < 3)
+                    {
+                        goto InputBufferTooSmall; // ran out of data
+                    }
+
+                    if (!IsLowByteUtf8ContinuationByte(pInputBuffer[2]))
+                    {
+                        goto Error; // third byte in 4-byte sequence not a continuation byte
+                    }
+
+                    if (inputLength < 4)
+                    {
+                        goto InputBufferTooSmall; // ran out of data
+                    }
+
+                    if (!IsLowByteUtf8ContinuationByte(pInputBuffer[3]))
+                    {
+                        goto Error; // fourth byte in 4-byte sequence not a continuation byte
+                    }
+
+                    // If we read a valid astral scalar value, the only way we could've fallen down this code path
+                    // is that we didn't have enough output buffer to write the result.
+
+                    goto OutputBufferTooSmall;
+                }
+                else
+                {
+                    goto Error; // didn't begin with [ C2 .. F4 ], so invalid multi-byte sequence header byte
+                }
+            }
+
+            OperationStatus retVal = OperationStatus.Done;
+            goto ReturnCommon;
+
+        InputBufferTooSmall:
+            retVal = OperationStatus.NeedMoreData;
+            goto ReturnCommon;
+
+        OutputBufferTooSmall:
+            retVal = OperationStatus.DestinationTooSmall;
+            goto ReturnCommon;
+
+        Error:
+            retVal = OperationStatus.InvalidData;
+            goto ReturnCommon;
+
+        ReturnCommon:
+            pInputBufferRemaining = pInputBuffer;
+            pOutputBufferRemaining = pOutputBuffer;
+            return retVal;
+        }
+
+        // On method return, pInputBufferRemaining points to the next char that would have been
+        // consumed and pOutputBufferRemaining points to where the next byte would have been written.
+        // inputLength is measured in chars; outputBytesRemaining is measured in bytes.
+        public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLength, byte* pOutputBuffer, int outputBytesRemaining, out char* pInputBufferRemaining, out byte* pOutputBufferRemaining)
+        {
+            const int CharsPerDWord = sizeof(uint) / sizeof(char);
+
+            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
+            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
+
+            Debug.Assert(outputBytesRemaining >= 0, "Destination length must not be negative.");
+            Debug.Assert(pOutputBuffer != null || outputBytesRemaining == 0, "Destination length must be zero if destination buffer pointer is null.");
+
+            // First, try vectorized conversion.
+
+            {
+                nuint numElementsConverted = ASCIIUtility.NarrowUtf16ToAscii(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputBytesRemaining));
+
+                pInputBuffer += numElementsConverted;
+                pOutputBuffer += numElementsConverted;
+
+                // Quick check - did we just end up consuming the entire input buffer?
+                // If so, short-circuit the remainder of the method.
+
+                if ((int)numElementsConverted == inputLength)
+                {
+                    pInputBufferRemaining = pInputBuffer;
+                    pOutputBufferRemaining = pOutputBuffer;
+                    return OperationStatus.Done;
+                }
+
+                inputLength -= (int)numElementsConverted;
+                outputBytesRemaining -= (int)numElementsConverted;
+            }
+
+            if (inputLength < CharsPerDWord)
+            {
+                goto ProcessInputOfLessThanDWordSize;
+            }
+
+            char* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - CharsPerDWord;
+
+            // We have paths for SSE4.1 (x64) and AdvSimd (little-endian Arm64) vectorization inside
+            // the inner loop. Since the below vector is only used in those code paths, we leave it
+            // uninitialized if neither instruction set is enabled.
+
+            Unsafe.SkipInit(out Vector128<short> nonAsciiUtf16DataMask);
+            if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian))
+            {
+                nonAsciiUtf16DataMask = Vector128.Create(unchecked((short)0xFF80)); // mask of non-ASCII bits in a UTF-16 char
+            }
+
+            // Begin the main loop.
+
+#if DEBUG
+            char* pLastBufferPosProcessed = null; // used for invariant checking in debug builds
+#endif
+
+            uint thisDWord;
+
+            Debug.Assert(pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer);
+            do
+            {
+                // Read 32 bits at a time. This is enough to hold any possible UTF16-encoded scalar.
+
+                thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+            AfterReadDWord:
+
+#if DEBUG
+                Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read.");
+                pLastBufferPosProcessed = pInputBuffer;
+#endif
+
+                // First, check for the common case of all-ASCII chars.
+
+                if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
+                {
+                    // We read an all-ASCII sequence (2 chars).
+
+                    if (outputBytesRemaining < 2)
+                    {
+                        goto ProcessOneCharFromCurrentDWordAndFinish; // running out of space, but may be able to write some data
+                    }
+
+                    // The high WORD of the local declared below might be populated with garbage
+                    // as a result of our shifts below, but that's ok since we're only going to
+                    // write the low WORD.
+                    //
+                    // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
+                    // (Same logic works regardless of endianness.)
+                    uint valueToWrite = thisDWord | (thisDWord >> 8);
+
+                    Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)valueToWrite);
+
+                    pInputBuffer += 2;
+                    pOutputBuffer += 2;
+                    outputBytesRemaining -= 2;
+
+                    // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
+                    // Below is basically unrolled loops with poor man's vectorization.
+
+                    uint inputCharsRemaining = (uint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + 2;
+                    uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining);
+
+                    if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian))
+                    {
+                        // Try reading and writing 8 elements per iteration.
+                        uint maxIters = minElementsRemaining / 8;
+                        ulong possibleNonAsciiQWord;
+                        int i;
+                        Vector128<short> utf16Data;
+                        for (i = 0; (uint)i < maxIters; i++)
+                        {
+                            utf16Data = Unsafe.ReadUnaligned<Vector128<short>>(pInputBuffer);
+
+                            if (AdvSimd.IsSupported)
+                            {
+                                Vector128<short> isUtf16DataNonAscii = AdvSimd.CompareTest(utf16Data, nonAsciiUtf16DataMask);
+                                bool hasNonAsciiDataInVector = AdvSimd.Arm64.MinPairwise(isUtf16DataNonAscii, isUtf16DataNonAscii).AsUInt64().ToScalar() != 0;
+
+                                if (hasNonAsciiDataInVector)
+                                {
+                                    goto LoopTerminatedDueToNonAsciiDataInVectorLocal;
+                                }
+
+                                Vector64<byte> lower = AdvSimd.ExtractNarrowingSaturateUnsignedLower(utf16Data);
+                                AdvSimd.Store(pOutputBuffer, lower);
+                            }
+                            else
+                            {
+                                if (!Sse41.TestZ(utf16Data, nonAsciiUtf16DataMask))
+                                {
+                                    goto LoopTerminatedDueToNonAsciiDataInVectorLocal;
+                                }
+
+                                // narrow and write
+                                Sse2.StoreScalar((ulong*)pOutputBuffer /* unaligned */, Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt64());
+                            }
+
+                            pInputBuffer += 8;
+                            pOutputBuffer += 8;
+                        }
+
+                        outputBytesRemaining -= 8 * i;
+
+                        // Can we perform one more iteration, but reading & writing 4 elements instead of 8?
+
+                        if ((minElementsRemaining & 4) != 0)
+                        {
+                            possibleNonAsciiQWord = Unsafe.ReadUnaligned<ulong>(pInputBuffer);
+                            if (!Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord))
+                            {
+                                goto LoopTerminatedDueToNonAsciiDataInPossibleNonAsciiQWordLocal;
+                            }
+
+                            utf16Data = Vector128.CreateScalarUnsafe(possibleNonAsciiQWord).AsInt16();
+
+                            if (AdvSimd.IsSupported)
+                            {
+                                Vector64<byte> lower = AdvSimd.ExtractNarrowingSaturateUnsignedLower(utf16Data);
+                                AdvSimd.StoreSelectedScalar((uint*)pOutputBuffer, lower.AsUInt32(), 0);
+                            }
+                            else
+                            {
+                                Unsafe.WriteUnaligned<uint>(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32()));
+                            }
+
+                            pInputBuffer += 4;
+                            pOutputBuffer += 4;
+                            outputBytesRemaining -= 4;
+                        }
+
+                        continue; // Go back to beginning of main loop, read data, check for ASCII
+
+                    LoopTerminatedDueToNonAsciiDataInVectorLocal:
+
+                        outputBytesRemaining -= 8 * i;
+
+                        if (Sse2.X64.IsSupported)
+                        {
+                            possibleNonAsciiQWord = Sse2.X64.ConvertToUInt64(utf16Data.AsUInt64());
+                        }
+                        else
+                        {
+                            possibleNonAsciiQWord = utf16Data.AsUInt64().ToScalar();
+                        }
+
+                        // Temporarily set 'possibleNonAsciiQWord' to be the low 64 bits of the vector,
+                        // then check whether it's all-ASCII. If so, narrow and write to the destination
+                        // buffer. Since we know that either the high 64 bits or the low 64 bits of the
+                        // vector contains non-ASCII data, by the end of the following block the
+                        // 'possibleNonAsciiQWord' local is guaranteed to contain the non-ASCII segment.
+
+                        if (Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord)) // all chars in first QWORD are ASCII
+                        {
+                            if (AdvSimd.IsSupported)
+                            {
+                                Vector64<byte> lower = AdvSimd.ExtractNarrowingSaturateUnsignedLower(utf16Data);
+                                AdvSimd.StoreSelectedScalar((uint*)pOutputBuffer, lower.AsUInt32(), 0);
+                            }
+                            else
+                            {
+                                Unsafe.WriteUnaligned<uint>(pOutputBuffer, Sse2.ConvertToUInt32(Sse2.PackUnsignedSaturate(utf16Data, utf16Data).AsUInt32()));
+                            }
+                            pInputBuffer += 4;
+                            pOutputBuffer += 4;
+                            outputBytesRemaining -= 4;
+                            possibleNonAsciiQWord = utf16Data.AsUInt64().GetElement(1);
+                        }
+
+                    LoopTerminatedDueToNonAsciiDataInPossibleNonAsciiQWordLocal:
+
+                        Debug.Assert(!Utf16Utility.AllCharsInUInt64AreAscii(possibleNonAsciiQWord)); // this condition should've been checked earlier
+
+                        thisDWord = (uint)possibleNonAsciiQWord;
+                        if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
+                        {
+                            // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
+                            Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
+                            pInputBuffer += 2;
+                            pOutputBuffer += 2;
+                            outputBytesRemaining -= 2;
+                            thisDWord = (uint)(possibleNonAsciiQWord >> 32);
+                        }
+
+                        goto AfterReadDWordSkipAllCharsAsciiCheck;
+                    }
+                    else
+                    {
+                        // Can't use SSE41 x64, so we'll only read and write 4 elements per iteration.
+                        uint maxIters = minElementsRemaining / 4;
+                        uint secondDWord;
+                        int i;
+                        for (i = 0; (uint)i < maxIters; i++)
+                        {
+                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                            secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + 2);
+
+                            if (!Utf16Utility.AllCharsInUInt32AreAscii(thisDWord | secondDWord))
+                            {
+                                goto LoopTerminatedDueToNonAsciiData;
+                            }
+
+                            // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
+                            // (Same logic works regardless of endianness.)
+                            Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
+                            Unsafe.WriteUnaligned<ushort>(pOutputBuffer + 2, (ushort)(secondDWord | (secondDWord >> 8)));
+
+                            pInputBuffer += 4;
+                            pOutputBuffer += 4;
+                        }
+
+                        outputBytesRemaining -= 4 * i;
+
+                        continue; // Go back to beginning of main loop, read data, check for ASCII
+
+                    LoopTerminatedDueToNonAsciiData:
+
+                        outputBytesRemaining -= 4 * i;
+
+                        // First, see if we can drain any ASCII data from the first DWORD.
+
+                        if (Utf16Utility.AllCharsInUInt32AreAscii(thisDWord))
+                        {
+                            // [ 00000000 0bbbbbbb | 00000000 0aaaaaaa ] -> [ 00000000 0bbbbbbb | 0bbbbbbb 0aaaaaaa ]
+                            // (Same logic works regardless of endianness.)
+                            Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)(thisDWord | (thisDWord >> 8)));
+                            pInputBuffer += 2;
+                            pOutputBuffer += 2;
+                            outputBytesRemaining -= 2;
+                            thisDWord = secondDWord;
+                        }
+
+                        goto AfterReadDWordSkipAllCharsAsciiCheck;
+                    }
+                }
+
+            AfterReadDWordSkipAllCharsAsciiCheck:
+
+                Debug.Assert(!Utf16Utility.AllCharsInUInt32AreAscii(thisDWord)); // this should have been handled earlier
+
+                // Next, try stripping off the first ASCII char if it exists.
+                // We don't check for a second ASCII char since that should have been handled above.
+
+                if (IsFirstCharAscii(thisDWord))
+                {
+                    if (outputBytesRemaining == 0)
+                    {
+                        goto OutputBufferTooSmall;
+                    }
+
+                    if (BitConverter.IsLittleEndian)
+                    {
+                        pOutputBuffer[0] = (byte)thisDWord; // extract [ ## ## 00 AA ]
+                    }
+                    else
+                    {
+                        pOutputBuffer[0] = (byte)(thisDWord >> 24); // extract [ AA 00 ## ## ]
+                    }
+
+                    pInputBuffer++;
+                    pOutputBuffer++;
+                    outputBytesRemaining--;
+
+                    if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                    {
+                        goto ProcessNextCharAndFinish; // input buffer doesn't contain enough data to read a DWORD
+                    }
+                    else
+                    {
+                        // The input buffer at the current offset contains a non-ASCII char.
+                        // Read an entire DWORD and fall through to non-ASCII consumption logic.
+                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                    }
+                }
+
+                // At this point, we know the first char in the buffer is non-ASCII, but we haven't yet validated it.
+
+                if (!IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
+                {
+                TryConsumeMultipleTwoByteSequences:
+
+                    // For certain text (Greek, Cyrillic, ...), 2-byte sequences tend to be clustered. We'll try transcoding them in
+                    // a tight loop without falling back to the main loop.
+
+                    if (IsSecondCharTwoUtf8Bytes(thisDWord))
+                    {
+                        // We have two runs of two bytes each.
+
+                        if (outputBytesRemaining < 4)
+                        {
+                            goto ProcessOneCharFromCurrentDWordAndFinish; // running out of output buffer
+                        }
+
+                        Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractTwoUtf8TwoByteSequencesFromTwoPackedUtf16Chars(thisDWord));
+
+                        pInputBuffer += 2;
+                        pOutputBuffer += 4;
+                        outputBytesRemaining -= 4;
+
+                        if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                        {
+                            goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+                        }
+                        else
+                        {
+                            // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
+                            // also two bytes. Check for that first before going back to the beginning of the loop.
+
+                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+                            if (IsFirstCharTwoUtf8Bytes(thisDWord))
+                            {
+                                // Validated we have a two-byte sequence coming up
+                                goto TryConsumeMultipleTwoByteSequences;
+                            }
+
+                            // If we reached this point, the next sequence is something other than a valid
+                            // two-byte sequence, so go back to the beginning of the loop.
+                            goto AfterReadDWord;
+                        }
+                    }
+
+                    if (outputBytesRemaining < 2)
+                    {
+                        goto OutputBufferTooSmall;
+                    }
+
+                    Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)ExtractUtf8TwoByteSequenceFromFirstUtf16Char(thisDWord));
+
+                    // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
+                    // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
+                    // char is ASCII?
+
+                    if (IsSecondCharAscii(thisDWord))
+                    {
+                        if (outputBytesRemaining >= 3)
+                        {
+                            if (BitConverter.IsLittleEndian)
+                            {
+                                thisDWord >>= 16;
+                            }
+                            pOutputBuffer[2] = (byte)thisDWord;
+
+                            pInputBuffer += 2;
+                            pOutputBuffer += 3;
+                            outputBytesRemaining -= 3;
+
+                            continue; // go back to original bounds check and check for ASCII
+                        }
+                        else
+                        {
+                            pInputBuffer++;
+                            pOutputBuffer += 2;
+                            goto OutputBufferTooSmall;
+                        }
+                    }
+                    else
+                    {
+                        pInputBuffer++;
+                        pOutputBuffer += 2;
+                        outputBytesRemaining -= 2;
+
+                        if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                        {
+                            goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+                        }
+                        else
+                        {
+                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                            goto BeforeProcessThreeByteSequence; // we know the next byte isn't ASCII, and it's not the start of a 2-byte sequence (this was checked above)
+                        }
+                    }
+                }
+
+            // Check the 3-byte case.
+
+            BeforeProcessThreeByteSequence:
+
+                if (!IsFirstCharSurrogate(thisDWord))
+                {
+                    // Optimization: A three-byte character could indicate CJK text, which makes it likely
+                    // that the character following this one is also CJK. We'll perform the check now
+                    // rather than jumping to the beginning of the main loop.
+
+                    if (IsSecondCharAtLeastThreeUtf8Bytes(thisDWord))
+                    {
+                        if (!IsSecondCharSurrogate(thisDWord))
+                        {
+                            if (outputBytesRemaining < 6)
+                            {
+                                goto ConsumeSingleThreeByteRun; // not enough space - try consuming as much as we can
+                            }
+
+                            WriteTwoUtf16CharsAsTwoUtf8ThreeByteSequences(ref *pOutputBuffer, thisDWord);
+
+                            pInputBuffer += 2;
+                            pOutputBuffer += 6;
+                            outputBytesRemaining -= 6;
+
+                            // Try to remain in the 3-byte processing loop if at all possible.
+
+                            if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                            {
+                                goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+                            }
+                            else
+                            {
+                                thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+                                if (IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
+                                {
+                                    goto BeforeProcessThreeByteSequence;
+                                }
+                                else
+                                {
+                                    // Fall back to standard processing loop since we don't know how to optimize this.
+                                    goto AfterReadDWord;
+                                }
+                            }
+                        }
+                    }
+
+                ConsumeSingleThreeByteRun:
+
+                    if (outputBytesRemaining < 3)
+                    {
+                        goto OutputBufferTooSmall;
+                    }
+
+                    WriteFirstUtf16CharAsUtf8ThreeByteSequence(ref *pOutputBuffer, thisDWord);
+
+                    pInputBuffer++;
+                    pOutputBuffer += 3;
+                    outputBytesRemaining -= 3;
+
+                    // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
+                    // into the text. If this happens, strip it off now before seeing if the next character
+                    // consists of three code units.
+
+                    if (IsSecondCharAscii(thisDWord))
+                    {
+                        if (outputBytesRemaining == 0)
+                        {
+                            goto OutputBufferTooSmall;
+                        }
+
+                        if (BitConverter.IsLittleEndian)
+                        {
+                            *pOutputBuffer = (byte)(thisDWord >> 16);
+                        }
+                        else
+                        {
+                            *pOutputBuffer = (byte)(thisDWord);
+                        }
+
+                        pInputBuffer++;
+                        pOutputBuffer++;
+                        outputBytesRemaining--;
+
+                        if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                        {
+                            goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+                        }
+                        else
+                        {
+                            thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+
+                            if (IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
+                            {
+                                goto BeforeProcessThreeByteSequence;
+                            }
+                            else
+                            {
+                                // Fall back to standard processing loop since we don't know how to optimize this.
+                                goto AfterReadDWord;
+                            }
+                        }
+                    }
+
+                    if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
+                    {
+                        goto ProcessNextCharAndFinish; // Running out of data - go down slow path
+                    }
+                    else
+                    {
+                        thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
+                        goto AfterReadDWordSkipAllCharsAsciiCheck; // we just checked above that this value isn't ASCII
+                    }
+                }
+
+                // Four byte sequence processing
+
+                if (IsWellFormedUtf16SurrogatePair(thisDWord))
+                {
+                    if (outputBytesRemaining < 4)
+                    {
+                        goto OutputBufferTooSmall;
+                    }
+
+                    Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractFourUtf8BytesFromSurrogatePair(thisDWord));
+
+                    pInputBuffer += 2;
+                    pOutputBuffer += 4;
+                    outputBytesRemaining -= 4;
+
+                    continue; // go back to beginning of loop for processing
+                }
+
+                goto Error; // an ill-formed surrogate sequence: high not followed by low, or low not preceded by high
+            } while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer);
+
+        ProcessNextCharAndFinish:
+            inputLength = (int)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + CharsPerDWord;
+
+        ProcessInputOfLessThanDWordSize:
+            Debug.Assert(inputLength < CharsPerDWord);
+
+            if (inputLength == 0)
+            {
+                goto InputBufferFullyConsumed;
+            }
+
+            uint thisChar = *pInputBuffer;
+            goto ProcessFinalChar;
+
+        ProcessOneCharFromCurrentDWordAndFinish:
+            if (BitConverter.IsLittleEndian)
+            {
+                thisChar = thisDWord & 0xFFFFu; // preserve only the first char
+            }
+            else
+            {
+                thisChar = thisDWord >> 16; // preserve only the first char
+            }
+
+        ProcessFinalChar:
+            {
+                if (thisChar <= 0x7Fu)
+                {
+                    if (outputBytesRemaining == 0)
+                    {
+                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+                    }
+
+                    // 1-byte (ASCII) case
+                    *pOutputBuffer = (byte)thisChar;
+
+                    pInputBuffer++;
+                    pOutputBuffer++;
+                }
+                else if (thisChar < 0x0800u)
+                {
+                    if (outputBytesRemaining < 2)
+                    {
+                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+                    }
+
+                    // 2-byte case
+                    pOutputBuffer[1] = (byte)((thisChar & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10xxxxxx ]
+                    pOutputBuffer[0] = (byte)((thisChar >> 6) | unchecked((uint)(sbyte)0xC0)); // [ 110yyyyy ]
+
+                    pInputBuffer++;
+                    pOutputBuffer += 2;
+                }
+                else if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
+                {
+                    if (outputBytesRemaining < 3)
+                    {
+                        goto OutputBufferTooSmall; // we have no hope of writing anything to the output
+                    }
+
+                    // 3-byte case
+                    pOutputBuffer[2] = (byte)((thisChar & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10xxxxxx ]
+                    pOutputBuffer[1] = (byte)(((thisChar >> 6) & 0x3Fu) | unchecked((uint)(sbyte)0x80)); // [ 10yyyyyy ]
+                    pOutputBuffer[0] = (byte)((thisChar >> 12) | unchecked((uint)(sbyte)0xE0)); // [ 1110zzzz ]
+
+                    pInputBuffer++;
+                    pOutputBuffer += 3;
+                }
+                else if (thisChar <= 0xDBFFu)
+                {
+                    // UTF-16 high surrogate code point with no trailing data, report incomplete input buffer
+                    goto InputBufferTooSmall;
+                }
+                else
+                {
+                    // UTF-16 low surrogate code point with no leading data, report error
+                    goto Error;
+                }
+            }
+
+            // There are two ways we can end up here. Either we were running low on input data,
+            // or we were running low on space in the destination buffer. If we're running low on
+            // input data (label targets ProcessInputOfLessThanDWordSize and ProcessNextCharAndFinish),
+            // then the inputLength value is guaranteed to be between 0 and 1, and we should return Done.
+            // If we're running low on destination buffer space (label target ProcessOneCharFromCurrentDWordAndFinish),
+            // then we didn't modify inputLength since entering the main loop, which means it should
+            // still have a value of >= 2. So checking the value of inputLength is all we need to do to determine
+            // which of the two scenarios we're in.
+
+            if (inputLength > 1)
+            {
+                goto OutputBufferTooSmall;
+            }
+
+        InputBufferFullyConsumed:
+            OperationStatus retVal = OperationStatus.Done;
+            goto ReturnCommon;
+
+        InputBufferTooSmall:
+            retVal = OperationStatus.NeedMoreData;
+            goto ReturnCommon;
+
+        OutputBufferTooSmall:
+            retVal = OperationStatus.DestinationTooSmall;
+            goto ReturnCommon;
+
+        Error:
+            retVal = OperationStatus.InvalidData;
+            goto ReturnCommon;
+
+        ReturnCommon:
+            pInputBufferRemaining = pInputBuffer;
+            pOutputBufferRemaining = pOutputBuffer;
+            return retVal;
+        }
+    }
+}
+#endif
diff --git a/src/DotNetty.Common/Internal/Utf8Utility.Transcoding.cs b/src/DotNetty.Common/Internal/Utf8Utility.Transcoding.NetCore3.cs
similarity index 93%
rename from src/DotNetty.Common/Internal/Utf8Utility.Transcoding.cs
rename to src/DotNetty.Common/Internal/Utf8Utility.Transcoding.NetCore3.cs
index 65670a61b..78c6629c8 100644
--- a/src/DotNetty.Common/Internal/Utf8Utility.Transcoding.cs
+++ b/src/DotNetty.Common/Internal/Utf8Utility.Transcoding.NetCore3.cs
@@ -4,7 +4,7 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
-#if NETCOREAPP_3_0_GREATER
+#if NETCOREAPP3_1
 using System;
 using System.Buffers;
 using System.Buffers.Binary;
@@ -15,7 +15,7 @@
 
 namespace DotNetty.Common.Internal
 {
-    internal static unsafe partial class Utf8Utility
+    unsafe partial class Utf8Utility
     {
         // On method return, pInputBufferRemaining and pOutputBufferRemaining will both point to where
         // the next byte would have been consumed from / the next char would have been written to.
@@ -86,7 +86,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                         goto ProcessRemainingBytesSlow; // running out of space, but may be able to write some data
                     }
 
-                    Utf8Utility.Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
+                    Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
                     pInputBuffer += 4;
                     pOutputBuffer += 4;
                     outputCharsRemaining -= 4;
@@ -112,8 +112,8 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
 
                         pInputBuffer += 8;
 
-                        Utf8Utility.Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[0], thisDWord);
-                        Utf8Utility.Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[4], secondDWord);
+                        Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[0], thisDWord);
+                        Widen4AsciiBytesToCharsAndWrite(ref pOutputBuffer[4], secondDWord);
 
                         pOutputBuffer += 8;
                     }
@@ -128,7 +128,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                     {
                         // The first DWORD contained all-ASCII bytes, so expand it.
 
-                        Utf8Utility.Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
+                        Widen4AsciiBytesToCharsAndWrite(ref *pOutputBuffer, thisDWord);
 
                         // continue the outer loop from the second DWORD
 
@@ -155,25 +155,25 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                 // Next, try stripping off ASCII bytes one at a time.
                 // We only handle up to three ASCII bytes here since we handled the four ASCII byte case above.
 
-                if (Utf8Utility.UInt32FirstByteIsAscii(thisDWord))
+                if (UInt32FirstByteIsAscii(thisDWord))
                 {
                     if (outputCharsRemaining >= 3)
                     {
                         // Fast-track: we don't need to check the destination length for subsequent
                         // ASCII bytes since we know we can write them all now.
 
-                        uint thisDWordLittleEndian = Utf8Utility.ToLittleEndian(thisDWord);
+                        uint thisDWordLittleEndian = ToLittleEndian(thisDWord);
 
                         nuint adjustment = 1;
                         pOutputBuffer[0] = (char)(byte)thisDWordLittleEndian;
 
-                        if (Utf8Utility.UInt32SecondByteIsAscii(thisDWord))
+                        if (UInt32SecondByteIsAscii(thisDWord))
                         {
                             adjustment++;
                             thisDWordLittleEndian >>= 8;
                             pOutputBuffer[1] = (char)(byte)thisDWordLittleEndian;
 
-                            if (Utf8Utility.UInt32ThirdByteIsAscii(thisDWord))
+                            if (UInt32ThirdByteIsAscii(thisDWord))
                             {
                                 adjustment++;
                                 thisDWordLittleEndian >>= 8;
@@ -195,13 +195,13 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                             goto OutputBufferTooSmall;
                         }
 
-                        uint thisDWordLittleEndian = Utf8Utility.ToLittleEndian(thisDWord);
+                        uint thisDWordLittleEndian = ToLittleEndian(thisDWord);
 
                         pInputBuffer++;
                         *pOutputBuffer++ = (char)(byte)thisDWordLittleEndian;
                         outputCharsRemaining--;
 
-                        if (Utf8Utility.UInt32SecondByteIsAscii(thisDWord))
+                        if (UInt32SecondByteIsAscii(thisDWord))
                         {
                             if (0u >= (uint)outputCharsRemaining)
                             {
@@ -224,7 +224,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
 
                             Debug.Assert(outputCharsRemaining == 1);
 
-                            if (Utf8Utility.UInt32ThirdByteIsAscii(thisDWord))
+                            if (UInt32ThirdByteIsAscii(thisDWord))
                             {
                                 goto OutputBufferTooSmall;
                             }
@@ -257,12 +257,12 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
 
                 // Check the 2-byte case.
 
-                if (Utf8Utility.UInt32BeginsWithUtf8TwoByteMask(thisDWord))
+                if (UInt32BeginsWithUtf8TwoByteMask(thisDWord))
                 {
                     // Per Table 3-7, valid sequences are:
                     // [ C2..DF ] [ 80..BF ]
 
-                    if (Utf8Utility.UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
+                    if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
                     {
                         goto Error;
                     }
@@ -277,8 +277,8 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                     // the value isn't overlong using a single comparison. On big-endian platforms, we'll need
                     // to validate the mask and validate that the sequence isn't overlong as two separate comparisons.
 
-                    if ((BitConverter.IsLittleEndian && Utf8Utility.UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
-                        || (!BitConverter.IsLittleEndian && (Utf8Utility.UInt32EndsWithUtf8TwoByteMask(thisDWord) && !Utf8Utility.UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord))))
+                    if ((BitConverter.IsLittleEndian && UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
+                        || (!BitConverter.IsLittleEndian && (UInt32EndsWithUtf8TwoByteMask(thisDWord) && !UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord))))
                     {
                         // We have two runs of two bytes each.
 
@@ -287,7 +287,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                             goto ProcessRemainingBytesSlow; // running out of output buffer
                         }
 
-                        Unsafe.WriteUnaligned<uint>(pOutputBuffer, Utf8Utility.ExtractTwoCharsPackedFromTwoAdjacentTwoByteSequences(thisDWord));
+                        Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractTwoCharsPackedFromTwoAdjacentTwoByteSequences(thisDWord));
 
                         pInputBuffer += 4;
                         pOutputBuffer += 2;
@@ -302,7 +302,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
 
                             if (BitConverter.IsLittleEndian)
                             {
-                                if (Utf8Utility.UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
+                                if (UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
                                 {
                                     // The next sequence is a valid two-byte sequence.
                                     goto ProcessTwoByteSequenceSkipOverlongFormCheck;
@@ -310,9 +310,9 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                             }
                             else
                             {
-                                if (Utf8Utility.UInt32BeginsWithUtf8TwoByteMask(thisDWord))
+                                if (UInt32BeginsWithUtf8TwoByteMask(thisDWord))
                                 {
-                                    if (Utf8Utility.UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
+                                    if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
                                     {
                                         goto Error; // The next sequence purports to be a 2-byte sequence but is overlong.
                                     }
@@ -335,11 +335,11 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                     // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
                     // bytes are ASCII?
 
-                    uint charToWrite = Utf8Utility.ExtractCharFromFirstTwoByteSequence(thisDWord); // optimistically compute this now, but don't store until we know dest is large enough
+                    uint charToWrite = ExtractCharFromFirstTwoByteSequence(thisDWord); // optimistically compute this now, but don't store until we know dest is large enough
 
-                    if (Utf8Utility.UInt32ThirdByteIsAscii(thisDWord))
+                    if (UInt32ThirdByteIsAscii(thisDWord))
                     {
-                        if (Utf8Utility.UInt32FourthByteIsAscii(thisDWord))
+                        if (UInt32FourthByteIsAscii(thisDWord))
                         {
                             if (outputCharsRemaining < 3)
                             {
@@ -420,7 +420,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
 
             BeforeProcessThreeByteSequence:
 
-                if (Utf8Utility.UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
+                if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
                 {
                 ProcessThreeByteSequenceWithCheck:
 
@@ -498,7 +498,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
 
                                 uint secondDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer + 3);
 
-                                if (Utf8Utility.UInt32BeginsWithUtf8ThreeByteMask(secondDWord)
+                                if (UInt32BeginsWithUtf8ThreeByteMask(secondDWord)
                                     && ((secondDWord & 0x0000_200Fu) != 0)
                                     && (((secondDWord - 0x0000_200Du) & 0x0000_200Fu) != 0))
                                 {
@@ -524,7 +524,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
 
                     // Couldn't extract 2x three-byte sequences together, just do this one by itself.
 
-                    *pOutputBuffer = (char)Utf8Utility.ExtractCharFromFirstThreeByteSequence(thisDWord);
+                    *pOutputBuffer = (char)ExtractCharFromFirstThreeByteSequence(thisDWord);
                     pInputBuffer += 3;
                     pOutputBuffer += 1;
                     outputCharsRemaining -= 1;
@@ -535,7 +535,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                     // in to the text. If this happens strip it off now before seeing if the next character
                     // consists of three code units.
 
-                    if (Utf8Utility.UInt32FourthByteIsAscii(thisDWord))
+                    if (UInt32FourthByteIsAscii(thisDWord))
                     {
                         if (0u >= (uint)outputCharsRemaining)
                         {
@@ -565,7 +565,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                         // marker now and jump directly to three-byte sequence processing if we see one, skipping
                         // all of the logic at the beginning of the loop.
 
-                        if (Utf8Utility.UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
+                        if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
                         {
                             goto ProcessThreeByteSequenceWithCheck; // found a three-byte sequence marker; validate and consume
                         }
@@ -590,7 +590,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                     // [ F1..F3 ] [ 80..BF ] [ 80..BF ] [ 80..BF ]
                     // [   F4   ] [ 80..8F ] [ 80..BF ] [ 80..BF ]
 
-                    if (!Utf8Utility.UInt32BeginsWithUtf8FourByteMask(thisDWord))
+                    if (!UInt32BeginsWithUtf8FourByteMask(thisDWord))
                     {
                         goto Error;
                     }
@@ -635,7 +635,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                         goto OutputBufferTooSmall;
                     }
 
-                    Unsafe.WriteUnaligned<uint>(pOutputBuffer, Utf8Utility.ExtractCharsFromFourByteSequence(thisDWord));
+                    Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractCharsFromFourByteSequence(thisDWord));
 
                     pInputBuffer += 4;
                     pOutputBuffer += 2;
@@ -681,7 +681,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                     }
 
                     uint secondByte = pInputBuffer[1];
-                    if (!Utf8Utility.IsLowByteUtf8ContinuationByte(secondByte))
+                    if (!IsLowByteUtf8ContinuationByte(secondByte))
                     {
                         goto Error; // 2-byte marker not followed by continuation byte
                     }
@@ -707,7 +707,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                     {
                         uint secondByte = pInputBuffer[1];
                         uint thirdByte = pInputBuffer[2];
-                        if (!Utf8Utility.IsLowByteUtf8ContinuationByte(secondByte) || !Utf8Utility.IsLowByteUtf8ContinuationByte(thirdByte))
+                        if (!IsLowByteUtf8ContinuationByte(secondByte) || !IsLowByteUtf8ContinuationByte(thirdByte))
                         {
                             goto Error; // 3-byte marker not followed by 2 continuation bytes
                         }
@@ -749,7 +749,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                     else if (inputLength >= 2)
                     {
                         uint secondByte = pInputBuffer[1];
-                        if (!Utf8Utility.IsLowByteUtf8ContinuationByte(secondByte))
+                        if (!IsLowByteUtf8ContinuationByte(secondByte))
                         {
                             goto Error; // 3-byte marker not followed by continuation byte
                         }
@@ -780,7 +780,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                     }
 
                     uint nextByte = pInputBuffer[1];
-                    if (!Utf8Utility.IsLowByteUtf8ContinuationByte(nextByte))
+                    if (!IsLowByteUtf8ContinuationByte(nextByte))
                     {
                         goto Error; // 4-byte marker not followed by a continuation byte
                     }
@@ -796,7 +796,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                         goto InputBufferTooSmall; // ran out of data
                     }
 
-                    if (!Utf8Utility.IsLowByteUtf8ContinuationByte(pInputBuffer[2]))
+                    if (!IsLowByteUtf8ContinuationByte(pInputBuffer[2]))
                     {
                         goto Error; // third byte in 4-byte sequence not a continuation byte
                     }
@@ -806,7 +806,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng
                         goto InputBufferTooSmall; // ran out of data
                     }
 
-                    if (!Utf8Utility.IsLowByteUtf8ContinuationByte(pInputBuffer[3]))
+                    if (!IsLowByteUtf8ContinuationByte(pInputBuffer[3]))
                     {
                         goto Error; // fourth byte in 4-byte sequence not a continuation byte
                     }
@@ -1077,7 +1077,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
                 // Next, try stripping off the first ASCII char if it exists.
                 // We don't check for a second ASCII char since that should have been handled above.
 
-                if (Utf8Utility.IsFirstCharAscii(thisDWord))
+                if (IsFirstCharAscii(thisDWord))
                 {
                     if (0u >= (uint)outputBytesRemaining)
                     {
@@ -1111,14 +1111,14 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
 
                 // At this point, we know the first char in the buffer is non-ASCII, but we haven't yet validated it.
 
-                if (!Utf8Utility.IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
+                if (!IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
                 {
                 TryConsumeMultipleTwoByteSequences:
 
                     // For certain text (Greek, Cyrillic, ...), 2-byte sequences tend to be clustered. We'll try transcoding them in
                     // a tight loop without falling back to the main loop.
 
-                    if (Utf8Utility.IsSecondCharTwoUtf8Bytes(thisDWord))
+                    if (IsSecondCharTwoUtf8Bytes(thisDWord))
                     {
                         // We have two runs of two bytes each.
 
@@ -1127,7 +1127,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
                             goto ProcessOneCharFromCurrentDWordAndFinish; // running out of output buffer
                         }
 
-                        Unsafe.WriteUnaligned<uint>(pOutputBuffer, Utf8Utility.ExtractTwoUtf8TwoByteSequencesFromTwoPackedUtf16Chars(thisDWord));
+                        Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractTwoUtf8TwoByteSequencesFromTwoPackedUtf16Chars(thisDWord));
 
                         pInputBuffer += 2;
                         pOutputBuffer += 4;
@@ -1144,7 +1144,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
 
                             thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
 
-                            if (Utf8Utility.IsFirstCharTwoUtf8Bytes(thisDWord))
+                            if (IsFirstCharTwoUtf8Bytes(thisDWord))
                             {
                                 // Validated we have a two-byte sequence coming up
                                 goto TryConsumeMultipleTwoByteSequences;
@@ -1161,13 +1161,13 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
                         goto OutputBufferTooSmall;
                     }
 
-                    Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)Utf8Utility.ExtractUtf8TwoByteSequenceFromFirstUtf16Char(thisDWord));
+                    Unsafe.WriteUnaligned<ushort>(pOutputBuffer, (ushort)ExtractUtf8TwoByteSequenceFromFirstUtf16Char(thisDWord));
 
                     // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
                     // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
                     // char is ASCII?
 
-                    if (Utf8Utility.IsSecondCharAscii(thisDWord))
+                    if (IsSecondCharAscii(thisDWord))
                     {
                         if (outputBytesRemaining >= 3)
                         {
@@ -1212,22 +1212,22 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
 
             BeforeProcessThreeByteSequence:
 
-                if (!Utf8Utility.IsFirstCharSurrogate(thisDWord))
+                if (!IsFirstCharSurrogate(thisDWord))
                 {
                     // Optimization: A three-byte character could indicate CJK text, which makes it likely
                     // that the character following this one is also CJK. We'll perform the check now
                     // rather than jumping to the beginning of the main loop.
 
-                    if (Utf8Utility.IsSecondCharAtLeastThreeUtf8Bytes(thisDWord))
+                    if (IsSecondCharAtLeastThreeUtf8Bytes(thisDWord))
                     {
-                        if (!Utf8Utility.IsSecondCharSurrogate(thisDWord))
+                        if (!IsSecondCharSurrogate(thisDWord))
                         {
                             if (outputBytesRemaining < 6)
                             {
                                 goto ConsumeSingleThreeByteRun; // not enough space - try consuming as much as we can
                             }
 
-                            Utf8Utility.WriteTwoUtf16CharsAsTwoUtf8ThreeByteSequences(ref *pOutputBuffer, thisDWord);
+                            WriteTwoUtf16CharsAsTwoUtf8ThreeByteSequences(ref *pOutputBuffer, thisDWord);
 
                             pInputBuffer += 2;
                             pOutputBuffer += 6;
@@ -1243,7 +1243,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
                             {
                                 thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
 
-                                if (Utf8Utility.IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
+                                if (IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
                                 {
                                     goto BeforeProcessThreeByteSequence;
                                 }
@@ -1263,7 +1263,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
                         goto OutputBufferTooSmall;
                     }
 
-                    Utf8Utility.WriteFirstUtf16CharAsUtf8ThreeByteSequence(ref *pOutputBuffer, thisDWord);
+                    WriteFirstUtf16CharAsUtf8ThreeByteSequence(ref *pOutputBuffer, thisDWord);
 
                     pInputBuffer += 1;
                     pOutputBuffer += 3;
@@ -1273,7 +1273,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
                     // in to the text. If this happens strip it off now before seeing if the next character
                     // consists of three code units.
 
-                    if (Utf8Utility.IsSecondCharAscii(thisDWord))
+                    if (IsSecondCharAscii(thisDWord))
                     {
                         if (0u >= (uint)outputBytesRemaining)
                         {
@@ -1301,7 +1301,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
                         {
                             thisDWord = Unsafe.ReadUnaligned<uint>(pInputBuffer);
 
-                            if (Utf8Utility.IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
+                            if (IsFirstCharAtLeastThreeUtf8Bytes(thisDWord))
                             {
                                 goto BeforeProcessThreeByteSequence;
                             }
@@ -1326,14 +1326,14 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt
 
                 // Four byte sequence processing
 
-                if (Utf8Utility.IsWellFormedUtf16SurrogatePair(thisDWord))
+                if (IsWellFormedUtf16SurrogatePair(thisDWord))
                 {
                     if (outputBytesRemaining < 4)
                     {
                         goto OutputBufferTooSmall;
                     }
 
-                    Unsafe.WriteUnaligned<uint>(pOutputBuffer, Utf8Utility.ExtractFourUtf8BytesFromSurrogatePair(thisDWord));
+                    Unsafe.WriteUnaligned<uint>(pOutputBuffer, ExtractFourUtf8BytesFromSurrogatePair(thisDWord));
 
                     pInputBuffer += 2;
                     pOutputBuffer += 4;
diff --git a/src/DotNetty.Common/Internal/Utf8Utility.Validation.Net.cs b/src/DotNetty.Common/Internal/Utf8Utility.Validation.Net.cs
new file mode 100644
index 000000000..299c8c759
--- /dev/null
+++ b/src/DotNetty.Common/Internal/Utf8Utility.Validation.Net.cs
@@ -0,0 +1,32 @@
+﻿// borrowed from https://github.com/dotnet/corefx/tree/release/3.1/src/Common/src/CoreLib/System/Text/Unicode
+
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#if NET
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+
+namespace DotNetty.Common.Internal
+{
+    partial class Utf8Utility
+    {
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bitMask128) // folds the bitMask128 bits of every lane of 'value' whose high bit is set into a 64-bit result; an all-ASCII vector yields 0
+        {
+            if (!AdvSimd.Arm64.IsSupported || !BitConverter.IsLittleEndian)
+            {
+                throw ThrowHelper.GetNotSupportedException(); // only the little-endian AdvSimd.Arm64 path is implemented here
+            }
+
+            Vector128<byte> mostSignificantBitIsSet = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte(); // arithmetic shift replicates the sign bit: 0xFF for lanes >= 0x80, 0x00 otherwise
+            Vector128<byte> extractedBits = AdvSimd.And(mostSignificantBitIsSet, bitMask128); // keep only the caller-supplied marker bits of the flagged lanes
+            extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits); // sum adjacent lanes; with identical operands the eight pair sums land in the low 64 bits
+            return extractedBits.AsUInt64().ToScalar(); // element 0, i.e. the low 64 bits
+        }
+    }
+}
+#endif
diff --git a/src/DotNetty.Common/Internal/Utf8Utility.Validation.cs b/src/DotNetty.Common/Internal/Utf8Utility.Validation.cs
index 33b0a4e86..2141f1f2a 100644
--- a/src/DotNetty.Common/Internal/Utf8Utility.Validation.cs
+++ b/src/DotNetty.Common/Internal/Utf8Utility.Validation.cs
@@ -10,10 +10,14 @@
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics.X86;
+#if NET
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+#endif
 
 namespace DotNetty.Common.Internal
 {
-    internal static unsafe partial class Utf8Utility
+    unsafe partial class Utf8Utility
     {
         // Returns &inputBuffer[inputLength] if the input buffer is valid.
         /// <summary>
@@ -118,6 +122,7 @@ internal static unsafe partial class Utf8Utility
                         // the alignment check consumes at most a single DWORD.)
 
                         byte* pInputBufferFinalPosAtWhichCanSafelyLoop = pFinalPosWhereCanReadDWordFromInputBuffer - 3 * sizeof(uint); // can safely read 4 DWORDs here
+#if NETCOREAPP3_1
                         uint mask;
 
                         do
@@ -136,6 +141,39 @@ internal static unsafe partial class Utf8Utility
                                     goto Sse2LoopTerminatedEarlyDueToNonAsciiData;
                                 }
                             }
+#else
+                        nuint trailingZeroCount;
+
+                        Vector128<byte> bitMask128 = BitConverter.IsLittleEndian ?
+                            Vector128.Create((ushort)0x1001).AsByte() :
+                            Vector128.Create((ushort)0x0110).AsByte();
+
+                        do
+                        {
+                            // pInputBuffer is 32-bit aligned but not necessarily 128-bit aligned, so we're
+                            // going to perform an unaligned load. We don't necessarily care about aligning
+                            // this because we pessimistically assume we'll encounter non-ASCII data at some
+                            // point in the not-too-distant future (otherwise we would've stayed entirely
+                            // within the all-ASCII vectorized code at the entry to this method).
+                            if (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)
+                            {
+                                ulong mask = GetNonAsciiBytes(AdvSimd.LoadVector128(pInputBuffer), bitMask128);
+                                if (mask != 0)
+                                {
+                                    trailingZeroCount = (nuint)BitOperations.TrailingZeroCount(mask) >> 2;
+                                    goto LoopTerminatedEarlyDueToNonAsciiData;
+                                }
+                            }
+                            else if (Sse2.IsSupported)
+                            {
+                                uint mask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pInputBuffer));
+                                if (mask != 0)
+                                {
+                                    trailingZeroCount = (nuint)BitOperations.TrailingZeroCount(mask);
+                                    goto LoopTerminatedEarlyDueToNonAsciiData;
+                                }
+                            }
+#endif
                             else
                             {
                                 if (!ASCIIUtility.AllBytesInUInt32AreAscii(((uint*)pInputBuffer)[0] | ((uint*)pInputBuffer)[1]))
@@ -154,6 +192,7 @@ internal static unsafe partial class Utf8Utility
 
                         continue; // need to perform a bounds check because we might be running out of data
 
+#if NETCOREAPP3_1
                     Sse2LoopTerminatedEarlyDueToNonAsciiData:
 
                         Debug.Assert(BitConverter.IsLittleEndian);
@@ -168,6 +207,22 @@ internal static unsafe partial class Utf8Utility
                         Debug.Assert(mask != 0);
 
                         pInputBuffer += Bmi1.TrailingZeroCount(mask);
+#else
+                    LoopTerminatedEarlyDueToNonAsciiData:
+                        // x86 can only be little endian, while ARM can be big or little endian
+                        // so if we reached this label we need to check both combinations are supported
+                        Debug.Assert((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported);
+
+
+                        // The 'mask' value will have a 0 bit for each ASCII byte we saw and a 1 bit
+                        // for each non-ASCII byte we saw. trailingZeroCount will count the number of ASCII bytes,
+                        // bump our input counter by that amount, and resume processing from the
+                        // "the first byte is no longer ASCII" portion of the main loop.
+                        // We should not expect a total number of zeroes equal or larger than 16.
+                        Debug.Assert(trailingZeroCount < 16);
+
+                        pInputBuffer += trailingZeroCount;
+#endif
                         if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
                         {
                             goto ProcessRemainingBytesSlow;
@@ -261,8 +316,8 @@ internal static unsafe partial class Utf8Utility
                     // the value isn't overlong using a single comparison. On big-endian platforms, we'll need
                     // to validate the mask and validate that the sequence isn't overlong as two separate comparisons.
 
-                    if ((BitConverter.IsLittleEndian && Utf8Utility.UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
-                        || (!BitConverter.IsLittleEndian && (Utf8Utility.UInt32EndsWithUtf8TwoByteMask(thisDWord) && !Utf8Utility.UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord))))
+                    if ((BitConverter.IsLittleEndian && UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
+                        || (!BitConverter.IsLittleEndian && (UInt32EndsWithUtf8TwoByteMask(thisDWord) && !UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord))))
                     {
                         // We have two runs of two bytes each.
                         pInputBuffer += 4;
@@ -277,7 +332,7 @@ internal static unsafe partial class Utf8Utility
 
                             if (BitConverter.IsLittleEndian)
                             {
-                                if (Utf8Utility.UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
+                                if (UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
                                 {
                                     // The next sequence is a valid two-byte sequence.
                                     goto ProcessTwoByteSequenceSkipOverlongFormCheck;
@@ -285,9 +340,9 @@ internal static unsafe partial class Utf8Utility
                             }
                             else
                             {
-                                if (Utf8Utility.UInt32BeginsWithUtf8TwoByteMask(thisDWord))
+                                if (UInt32BeginsWithUtf8TwoByteMask(thisDWord))
                                 {
-                                    if (Utf8Utility.UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
+                                    if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
                                     {
                                         goto Error; // The next sequence purports to be a 2-byte sequence but is overlong.
                                     }
@@ -312,9 +367,9 @@ internal static unsafe partial class Utf8Utility
 
                     tempUtf16CodeUnitCountAdjustment--; // 2-byte sequence + (some number of ASCII bytes) -> 1 UTF-16 code units (and 1 scalar) [+ trailing]
 
-                    if (Utf8Utility.UInt32ThirdByteIsAscii(thisDWord))
+                    if (UInt32ThirdByteIsAscii(thisDWord))
                     {
-                        if (Utf8Utility.UInt32FourthByteIsAscii(thisDWord))
+                        if (UInt32FourthByteIsAscii(thisDWord))
                         {
                             pInputBuffer += 4;
                         }
@@ -449,7 +504,7 @@ internal static unsafe partial class Utf8Utility
                             // Is this three 3-byte sequences in a row?
                             // thisQWord = [ 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz ] [ 10xxxxxx ]
                             //               ---- CHAR 3  ----   --------- CHAR 2 ---------   --------- CHAR 1 ---------     -CHAR 3-
-                            if ((thisQWord & 0xC0F0_C0C0_F0C0_C0F0ul) == 0x80E0_8080_E080_80E0ul && Utf8Utility.IsUtf8ContinuationByte(in pInputBuffer[8]))
+                            if ((thisQWord & 0xC0F0_C0C0_F0C0_C0F0ul) == 0x80E0_8080_E080_80E0ul && IsUtf8ContinuationByte(in pInputBuffer[8]))
                             {
                                 // Saw a proper bitmask for three incoming 3-byte sequences, perform the
                                 // overlong and surrogate sequence checking now.
@@ -523,7 +578,7 @@ internal static unsafe partial class Utf8Utility
                                 continue;
                             }
 
-                            if (Utf8Utility.UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
+                            if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
                             {
                                 // A single three-byte sequence.
                                 goto ProcessThreeByteSequenceWithCheck;
@@ -545,7 +600,7 @@ internal static unsafe partial class Utf8Utility
                         // marker now and jump directly to three-byte sequence processing if we see one, skipping
                         // all of the logic at the beginning of the loop.
 
-                        if (Utf8Utility.UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
+                        if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
                         {
                             goto ProcessThreeByteSequenceWithCheck; // Found another [not yet validated] three-byte sequence; process
                         }
@@ -655,7 +710,7 @@ internal static unsafe partial class Utf8Utility
                     if ((byte)firstByte < 0xE0u)
                     {
                         // 2-byte case
-                        if ((byte)firstByte >= 0xC2u && Utf8Utility.IsLowByteUtf8ContinuationByte(secondByte))
+                        if ((byte)firstByte >= 0xC2u && IsLowByteUtf8ContinuationByte(secondByte))
                         {
                             pInputBuffer += 2;
                             tempUtf16CodeUnitCountAdjustment--; // 2 UTF-8 bytes -> 1 UTF-16 code unit (and 1 scalar)
@@ -683,13 +738,13 @@ internal static unsafe partial class Utf8Utility
                             }
                             else
                             {
-                                if (!Utf8Utility.IsLowByteUtf8ContinuationByte(secondByte))
+                                if (!IsLowByteUtf8ContinuationByte(secondByte))
                                 {
                                     goto Error; // first trailing byte doesn't have proper continuation marker
                                 }
                             }
 
-                            if (Utf8Utility.IsUtf8ContinuationByte(in pInputBuffer[2]))
+                            if (IsUtf8ContinuationByte(in pInputBuffer[2]))
                             {
                                 pInputBuffer += 3;
                                 tempUtf16CodeUnitCountAdjustment -= 2; // 3 UTF-8 bytes -> 2 UTF-16 code units (and 2 scalars)
diff --git a/src/DotNetty.Common/Internal/Utf8Utility.WhiteSpace.cs b/src/DotNetty.Common/Internal/Utf8Utility.WhiteSpace.cs
new file mode 100644
index 000000000..5030c5c37
--- /dev/null
+++ b/src/DotNetty.Common/Internal/Utf8Utility.WhiteSpace.cs
@@ -0,0 +1,132 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#if NETCOREAPP_3_0_GREATER
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Text;
+
+namespace DotNetty.Common.Internal
+{
+    partial class Utf8Utility
+    {
+        /// <summary>
+        /// Locates the first non-whitespace character in <paramref name="utf8Data"/> and returns its
+        /// byte index. When the data consists solely of whitespace, the input length is returned.
+        /// </summary>
+        public static int GetIndexOfFirstNonWhiteSpaceChar(ReadOnlySpan<byte> utf8Data)
+        {
+            ref byte start = ref MemoryMarshal.GetReference(utf8Data);
+            return (int)GetIndexOfFirstNonWhiteSpaceChar(ref start, utf8Data.Length);
+        }
+
+        /// <summary>
+        /// Reference-based core of <see cref="GetIndexOfFirstNonWhiteSpaceChar(ReadOnlySpan{byte})"/>:
+        /// scans <paramref name="length"/> bytes starting at <paramref name="utf8Data"/>.
+        /// </summary>
+        internal static nint GetIndexOfFirstNonWhiteSpaceChar(ref byte utf8Data, nint length)
+        {
+            // Tuned for ASCII input where, if anything needs trimming at all, only a small
+            // number of leading bytes is expected to be whitespace.
+
+            nint offset = 0;
+
+            while (offset < length)
+            {
+                byte current = Unsafe.AddByteOffset(ref utf8Data, offset);
+
+                // Cheap early exit for bytes in the range [ 21 .. 7F ]: reinterpreted as a signed
+                // byte, only those values compare greater than 0x20.
+                if ((sbyte)current > (sbyte)0x20)
+                {
+                    break;
+                }
+
+                nint whiteSpaceByteCount; // stays 0 when the current scalar is not whitespace
+
+                if (UnicodeUtility.IsAsciiCodePoint(current))
+                {
+                    // An ASCII byte at or below 0x20: Rune decides whether it is whitespace.
+                    whiteSpaceByteCount = Rune.IsWhiteSpace(new Rune((uint)current)) ? 1 : 0;
+                }
+                else
+                {
+                    // Non-ASCII lead byte: decode the whole scalar from the unread tail and
+                    // classify the decoded value instead.
+                    ReadOnlySpan<byte> unread = MemoryMarshal.CreateReadOnlySpan(ref utf8Data, (int)length).Slice((int)offset);
+                    Rune.DecodeFromUtf8(unread, out Rune scalar, out int consumed);
+                    whiteSpaceByteCount = Rune.IsWhiteSpace(scalar) ? consumed : 0;
+                }
+
+                if (whiteSpaceByteCount == 0)
+                {
+                    break; // first non-whitespace subsequence found
+                }
+
+                offset += whiteSpaceByteCount;
+            }
+
+            return offset;
+        }
+
+        /// <summary>
+        /// Locates the start of the trailing whitespace run in <paramref name="utf8Data"/>. Returns 0
+        /// when the data consists solely of whitespace, and the span length when the data has no
+        /// trailing whitespace at all.
+        /// </summary>
+        public static int GetIndexOfTrailingWhiteSpaceSequence(ReadOnlySpan<byte> utf8Data)
+        {
+            ref byte start = ref MemoryMarshal.GetReference(utf8Data);
+            return (int)GetIndexOfTrailingWhiteSpaceSequence(ref start, utf8Data.Length);
+        }
+
+        /// <summary>
+        /// Reference-based core of <see cref="GetIndexOfTrailingWhiteSpaceSequence(ReadOnlySpan{byte})"/>:
+        /// scans backward over the <paramref name="length"/> bytes starting at <paramref name="utf8Data"/>.
+        /// </summary>
+        internal static nint GetIndexOfTrailingWhiteSpaceSequence(ref byte utf8Data, nint length)
+        {
+            // Tuned for ASCII input where, if anything needs trimming at all, only a small
+            // number of trailing bytes is expected to be whitespace.
+
+            while (length > 0)
+            {
+                byte last = Unsafe.AddByteOffset(ref utf8Data, length - 1);
+
+                // Cheap early exit for bytes in the range [ 21 .. 7F ]: reinterpreted as a signed
+                // byte, only those values compare greater than 0x20.
+                if ((sbyte)last > (sbyte)0x20)
+                {
+                    break;
+                }
+
+                nint whiteSpaceByteCount; // stays 0 when the final scalar is not whitespace
+
+                if (UnicodeUtility.IsAsciiCodePoint(last))
+                {
+                    // An ASCII byte at or below 0x20: Rune decides whether it is whitespace.
+                    whiteSpaceByteCount = Rune.IsWhiteSpace(new Rune((uint)last)) ? 1 : 0;
+                }
+                else
+                {
+                    // Non-ASCII final byte: decode the last whole scalar of the untrimmed data
+                    // and classify the decoded value instead.
+                    ReadOnlySpan<byte> untrimmed = MemoryMarshal.CreateReadOnlySpan(ref utf8Data, (int)length);
+                    Rune.DecodeLastFromUtf8(untrimmed, out Rune scalar, out int consumed);
+                    whiteSpaceByteCount = Rune.IsWhiteSpace(scalar) ? consumed : 0;
+                }
+
+                if (whiteSpaceByteCount == 0)
+                {
+                    break; // last non-whitespace subsequence found
+                }
+
+                length -= whiteSpaceByteCount;
+            }
+
+            return length;
+        }
+    }
+}
+#endif
\ No newline at end of file
diff --git a/src/DotNetty.Common/Internal/Utf8Utility.cs b/src/DotNetty.Common/Internal/Utf8Utility.cs
index e7febc58b..d82b34e0f 100644
--- a/src/DotNetty.Common/Internal/Utf8Utility.cs
+++ b/src/DotNetty.Common/Internal/Utf8Utility.cs
@@ -8,12 +8,6 @@
 using System;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
-#if FEATURE_UTF8STRING
-using System.Buffers;
-using System.Diagnostics;
-using System.Diagnostics.CodeAnalysis;
-using System.IO;
-#endif
 
 namespace DotNetty.Common.Internal
 {
@@ -48,59 +42,6 @@ public unsafe static int GetIndexOfFirstInvalidUtf8Sequence(in ReadOnlySpan<byte
                 return ((uint)index < (uint)utf8Data.Length) ? index : -1;
             }
         }
-
-#if FEATURE_UTF8STRING
-        /// <summary>
-        /// Returns <paramref name="value"/> if it is null or contains only well-formed UTF-8 data;
-        /// otherwises allocates a new <see cref="Utf8String"/> instance containing the same data as
-        /// <paramref name="value"/> but where all invalid UTF-8 sequences have been replaced
-        /// with U+FFD.
-        /// </summary>
-        [return: NotNullIfNotNull("value")]
-        public static Utf8String? ValidateAndFixupUtf8String(Utf8String? value)
-        {
-            if (Utf8String.IsNullOrEmpty(value))
-            {
-                return value;
-            }
-
-            ReadOnlySpan<byte> valueAsBytes = value.AsBytes();
-
-            int idxOfFirstInvalidData = GetIndexOfFirstInvalidUtf8Sequence(valueAsBytes, out _);
-            if (idxOfFirstInvalidData < 0)
-            {
-                return value;
-            }
-
-            // TODO_UTF8STRING: Replace this with the faster implementation once it's available.
-            // (The faster implementation is in the dev/utf8string_bak branch currently.)
-
-            MemoryStream memStream = new MemoryStream();
-            memStream.Write(valueAsBytes.Slice(0, idxOfFirstInvalidData));
-
-            valueAsBytes = valueAsBytes.Slice(idxOfFirstInvalidData);
-            do
-            {
-                if (Rune.DecodeFromUtf8(valueAsBytes, out _, out int bytesConsumed) == OperationStatus.Done)
-                {
-                    //  Valid scalar value - copy data as-is to MemoryStream
-                    memStream.Write(valueAsBytes.Slice(0, bytesConsumed));
-                }
-                else
-                {
-                    // Invalid scalar value - copy U+FFFD to MemoryStream
-                    memStream.Write(ReplacementCharSequence);
-                }
-
-                valueAsBytes = valueAsBytes.Slice(bytesConsumed);
-            } while (!valueAsBytes.IsEmpty);
-
-            bool success = memStream.TryGetBuffer(out ArraySegment<byte> memStreamBuffer);
-            Debug.Assert(success, "Couldn't get underlying MemoryStream buffer.");
-
-            return Utf8String.DangerousCreateWithoutValidation(memStreamBuffer, assumeWellFormed: true);
-        }
-#endif // FEATURE_UTF8STRING
     }
 }
 #endif
diff --git a/test/DotNetty.Common.Tests/DotNetty.Common.Tests.csproj b/test/DotNetty.Common.Tests/DotNetty.Common.Tests.csproj
index f28d8af8f..5441058a2 100644
--- a/test/DotNetty.Common.Tests/DotNetty.Common.Tests.csproj
+++ b/test/DotNetty.Common.Tests/DotNetty.Common.Tests.csproj
@@ -4,7 +4,10 @@
     <TargetFrameworks>$(StandardTestTfms)</TargetFrameworks>
     <RootNamespace>DotNetty.Common.Tests</RootNamespace>
     <AssemblyName>DotNetty.Common.Tests</AssemblyName>
-    <AllowUnsafeBlocks>false</AllowUnsafeBlocks>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+  </PropertyGroup>
+  <PropertyGroup  Condition=" '$(TargetFramework)' == 'net5.0' or '$(TargetFramework)' == 'netcoreapp3.1' ">
+    <DefineConstants>$(DefineConstants);CORELIBTEST</DefineConstants>
   </PropertyGroup>
   <PropertyGroup Condition=" '$(ImportLibs)' == 'netfx' ">
     <RuntimeIdentifier>win-x64</RuntimeIdentifier>
diff --git a/test/DotNetty.Common.Tests/Internal/CoreLib/ASCIIUtilityTests.cs b/test/DotNetty.Common.Tests/Internal/CoreLib/ASCIIUtilityTests.cs
new file mode 100644
index 000000000..2587a2169
--- /dev/null
+++ b/test/DotNetty.Common.Tests/Internal/CoreLib/ASCIIUtilityTests.cs
@@ -0,0 +1,419 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#if CORELIBTEST
+using System;
+using System.Buffers;
+using System.Numerics;
+using System.Runtime.InteropServices;
+using System.Security.Cryptography;
+using DotNetty.Common.Internal;
+using Xunit;
+
+namespace DotNetty.Common.Tests.Internal.CoreLib
+{
+    // Since many of the methods we'll be testing are internal, we'll need to invoke
+    // them via reflection.
+    public static unsafe class AsciiUtilityTests
+    {
+        private const int SizeOfVector128 = 128 / 8;
+
+        [Fact]
+        public static void GetIndexOfFirstNonAsciiByte_EmptyInput_NullReference()
+        {
+            Assert.Equal(UIntPtr.Zero, (UIntPtr)ASCIIUtility.GetIndexOfFirstNonAsciiByte(null, UIntPtr.Zero)); // zero-length input with a null pointer is expected to yield index 0
+        }
+
+        [Fact]
+        public static void GetIndexOfFirstNonAsciiByte_EmptyInput_NonNullReference()
+        {
+            byte b = default; // only supplies a valid, non-null address; the length passed below is zero
+            Assert.Equal(UIntPtr.Zero, (UIntPtr)ASCIIUtility.GetIndexOfFirstNonAsciiByte(&b, UIntPtr.Zero)); // zero-length input is expected to yield index 0
+        }
+
+        [Fact]
+        public static void GetIndexOfFirstNonAsciiByte_Vector128InnerLoop()
+        {
+            // The purpose of this test is to make sure we're identifying the correct
+            // vector (of the two that we're reading simultaneously) when performing
+            // the final ASCII drain at the end of the method once we've broken out
+            // of the inner loop.
+
+            using (BoundedMemory<byte> mem = BoundedMemory.Allocate<byte>(1024))
+            {
+                Span<byte> bytes = mem.Span;
+
+                for (int i = 0; i < bytes.Length; i++)
+                {
+                    bytes[i] &= 0x7F; // make sure each byte (of the pre-populated random data) is ASCII
+                }
+
+                // Two vectors have offsets 0 .. 31. We'll go backward to avoid having to
+                // re-clear the vector every time.
+
+                for (int i = 2 * SizeOfVector128 - 1; i >= 0; i--) // i runs 31 .. 0, so the highest offset written is 100 + 31 * 13 = 503
+                {
+                    bytes[100 + i * 13] = 0x80; // 13 is relatively prime to 32, so it ensures all possible positions are hit
+                    Assert.Equal(100 + i * 13, CallGetIndexOfFirstNonAsciiByte(bytes)); // the marker just written has the lowest offset so far, so it must be the one reported
+                }
+            }
+        }
+
+        [Fact]
+        public static void GetIndexOfFirstNonAsciiByte_Boundaries()
+        {
+            // The purpose of this test is to make sure we're hitting all of the vectorized
+            // and draining logic correctly both in the SSE2 and in the non-SSE2 enlightened
+            // code paths. We shouldn't be reading beyond the boundaries we were given.
+
+            // The 5 * Vector test should make sure that we're exercising all possible
+            // code paths across both implementations.
+            using (BoundedMemory<byte> mem = BoundedMemory.Allocate<byte>(5 * Vector<byte>.Count))
+            {
+                Span<byte> bytes = mem.Span;
+
+                // First, try it with all-ASCII buffers.
+
+                for (int i = 0; i < bytes.Length; i++)
+                {
+                    bytes[i] &= 0x7F; // make sure each byte (of the pre-populated random data) is ASCII
+                }
+
+                for (int i = bytes.Length; i >= 0; i--) // every prefix length from the full buffer down to empty
+                {
+                    Assert.Equal(i, CallGetIndexOfFirstNonAsciiByte(bytes.Slice(0, i))); // all-ASCII prefix: the expected result is the prefix length itself
+                }
+
+                // Then, try it with non-ASCII bytes.
+
+                for (int i = bytes.Length; i >= 1; i--)
+                {
+                    bytes[i - 1] = 0x80; // set non-ASCII
+                    Assert.Equal(i - 1, CallGetIndexOfFirstNonAsciiByte(bytes.Slice(0, i))); // the slice ends at the byte just modified, so it is the only non-ASCII byte in view
+                }
+            }
+        }
+
+        [Fact]
+        public static void GetIndexOfFirstNonAsciiChar_EmptyInput_NullReference()
+        {
+            Assert.Equal(UIntPtr.Zero, (UIntPtr)ASCIIUtility.GetIndexOfFirstNonAsciiChar(null, UIntPtr.Zero)); // zero-length input with a null pointer is expected to yield index 0
+        }
+
+        [Fact]
+        public static void GetIndexOfFirstNonAsciiChar_EmptyInput_NonNullReference()
+        {
+            char c = default; // only supplies a valid, non-null address; the length passed below is zero
+            Assert.Equal(UIntPtr.Zero, (UIntPtr)ASCIIUtility.GetIndexOfFirstNonAsciiChar(&c, UIntPtr.Zero)); // zero-length input is expected to yield index 0
+        }
+
+        [Fact]
+        public static void GetIndexOfFirstNonAsciiChar_Vector128InnerLoop()
+        {
+            // The purpose of this test is to make sure we're identifying the correct
+            // vector (of the two that we're reading simultaneously) when performing
+            // the final ASCII drain at the end of the method once we've broken out
+            // of the inner loop.
+            //
+            // Use U+0123 instead of U+0080 for this test because if our implementation
+            // uses pminuw / pmovmskb incorrectly, U+0123 will incorrectly show up as ASCII,
+            // causing our test to produce a false negative.
+
+            using (BoundedMemory<char> mem = BoundedMemory.Allocate<char>(1024))
+            {
+                Span<char> chars = mem.Span;
+
+                for (int i = 0; i < chars.Length; i++)
+                {
+                    chars[i] &= '\u007F'; // make sure each char (of the pre-populated random data) is ASCII
+                }
+
+                // Two vectors have offsets 0 .. 31. We'll go backward to avoid having to
+                // re-clear the vector every time.
+
+                for (int i = 2 * SizeOfVector128 - 1; i >= 0; i--) // i runs 31 .. 0, so the highest offset written is 100 + 31 * 13 = 503
+                {
+                    chars[100 + i * 13] = '\u0123'; // 13 is relatively prime to 32, so it ensures all possible positions are hit
+                    Assert.Equal(100 + i * 13, CallGetIndexOfFirstNonAsciiChar(chars)); // the marker just written has the lowest offset so far, so it must be the one reported
+                }
+            }
+        }
+
+        [Fact]
+        public static void GetIndexOfFirstNonAsciiChar_Boundaries()
+        {
+            // The purpose of this test is to make sure we're hitting all of the vectorized
+            // and draining logic correctly both in the SSE2 and in the non-SSE2 enlightened
+            // code paths. We shouldn't be reading beyond the boundaries we were given.
+            //
+            // The 5 * Vector test should make sure that we're exercising all possible
+            // code paths across both implementations. The sizeof(char) is because we're
+            // specifying element count, but underlying implementation reinterpret casts to bytes.
+            //
+            // Use U+0123 instead of U+0080 for this test because if our implementation
+            // uses pminuw / pmovmskb incorrectly, U+0123 will incorrectly show up as ASCII,
+            // causing our test to produce a false negative.
+
+            using (BoundedMemory<char> mem = BoundedMemory.Allocate<char>(5 * Vector<byte>.Count / sizeof(char)))
+            {
+                Span<char> chars = mem.Span;
+
+                for (int i = 0; i < chars.Length; i++)
+                {
+                    chars[i] &= '\u007F'; // make sure each char (of the pre-populated random data) is ASCII
+                }
+
+                for (int i = chars.Length; i >= 0; i--) // every prefix length from the full buffer down to empty
+                {
+                    Assert.Equal(i, CallGetIndexOfFirstNonAsciiChar(chars.Slice(0, i))); // all-ASCII prefix: the expected result is the prefix length itself
+                }
+
+                // Then, try it with non-ASCII chars.
+
+                for (int i = chars.Length; i >= 1; i--)
+                {
+                    chars[i - 1] = '\u0123'; // set non-ASCII
+                    Assert.Equal(i - 1, CallGetIndexOfFirstNonAsciiChar(chars.Slice(0, i))); // the slice ends at the char just modified, so it is the only non-ASCII char in view
+                }
+            }
+        }
+
+        [Fact]
+        public static void WidenAsciiToUtf16_EmptyInput_NullReferences()
+        {
+            Assert.Equal(UIntPtr.Zero, (UIntPtr)ASCIIUtility.WidenAsciiToUtf16(null, null, UIntPtr.Zero)); // zero-length call with null pointers is expected to return 0
+        }
+
+        [Fact]
+        public static void WidenAsciiToUtf16_EmptyInput_NonNullReference()
+        {
+            byte b = default; // stack locals only provide valid, non-null addresses for the call below
+            char c = default;
+            Assert.Equal(UIntPtr.Zero, (UIntPtr)ASCIIUtility.WidenAsciiToUtf16(&b, &c, UIntPtr.Zero)); // element count is zero, so the expected result is 0
+        }
+
+        [Fact]
+        public static void WidenAsciiToUtf16_AllAsciiInput()
+        {
+            using BoundedMemory<byte> asciiMem = BoundedMemory.Allocate<byte>(128);
+            using BoundedMemory<char> utf16Mem = BoundedMemory.Allocate<char>(128);
+
+            // Fill source with 00 .. 7F, then trap future writes.
+
+            Span<byte> asciiSpan = asciiMem.Span;
+            for (int i = 0; i < asciiSpan.Length; i++)
+            {
+                asciiSpan[i] = (byte)i;
+            }
+            asciiMem.MakeReadonly();
+
+            // We'll write to the UTF-16 span.
+            // We test with a variety of span lengths to test alignment and fallthrough code paths.
+
+            Span<char> utf16Span = utf16Mem.Span;
+
+            for (int i = 0; i < asciiSpan.Length; i++)
+            {
+                utf16Span.Clear(); // remove any data from previous iteration
+
+                // First, validate that the workhorse saw the incoming data as all-ASCII.
+
+                Assert.Equal(128 - i, CallWidenAsciiToUtf16(asciiSpan.Slice(i), utf16Span.Slice(i)));
+
+                // Then, validate that every element in [i, 128) was transcoded properly (index with j, not i).
+
+                for (int j = i; j < 128; j++)
+                {
+                    Assert.Equal((ushort)asciiSpan[j], (ushort)utf16Span[j]);
+                }
+            }
+        }
+
+        [Fact]
+        public static void WidenAsciiToUtf16_SomeNonAsciiInput()
+        {
+            using BoundedMemory<byte> asciiMem = BoundedMemory.Allocate<byte>(128);
+            using BoundedMemory<char> utf16Mem = BoundedMemory.Allocate<char>(128);
+
+            // Fill source with 00 .. 7F. The source stays writable: each iteration below marks one more byte as non-ASCII.
+
+            Span<byte> asciiSpan = asciiMem.Span;
+            for (int i = 0; i < asciiSpan.Length; i++)
+            {
+                asciiSpan[i] = (byte)i;
+            }
+
+            // We'll write to the UTF-16 span.
+
+            Span<char> utf16Span = utf16Mem.Span;
+
+            for (int i = asciiSpan.Length - 1; i >= 0; i--)
+            {
+                RandomNumberGenerator.Fill(MemoryMarshal.Cast<char, byte>(utf16Span)); // fill with garbage
+
+                // First, keep track of the garbage we wrote to the destination.
+                // We want to ensure it wasn't overwritten.
+
+                char[] expectedTrailingData = utf16Span.Slice(i).ToArray();
+
+                // Then, set the desired byte as non-ASCII, then check that the workhorse
+                // correctly saw the data as non-ASCII (it must stop at index i).
+
+                asciiSpan[i] |= (byte)0x80;
+                Assert.Equal(i, CallWidenAsciiToUtf16(asciiSpan, utf16Span));
+
+                // Next, validate that the ASCII data before index i was transcoded properly.
+
+                for (int j = 0; j < i; j++)
+                {
+                    Assert.Equal((ushort)asciiSpan[j], (ushort)utf16Span[j]);
+                }
+
+                // Finally, validate that the destination from index i onward still holds the garbage captured above.
+
+                Assert.Equal(expectedTrailingData, utf16Span.Slice(i).ToArray());
+            }
+        }
+
+        [Fact]
+        public static unsafe void NarrowUtf16ToAscii_EmptyInput_NullReferences()
+        {
+            Assert.Equal(UIntPtr.Zero, (UIntPtr)ASCIIUtility.NarrowUtf16ToAscii(null, null, UIntPtr.Zero)); // zero-length call with null pointers is expected to return 0
+        }
+
+        [Fact]
+        public static void NarrowUtf16ToAscii_EmptyInput_NonNullReference()
+        {
+            char c = default; // stack locals only provide valid, non-null addresses for the call below
+            byte b = default;
+            Assert.Equal(UIntPtr.Zero, (UIntPtr)ASCIIUtility.NarrowUtf16ToAscii(&c, &b, UIntPtr.Zero)); // element count is zero, so the expected result is 0
+        }
+
+        [Fact]
+        public static void NarrowUtf16ToAscii_AllAsciiInput()
+        {
+            using BoundedMemory<char> utf16Mem = BoundedMemory.Allocate<char>(128);
+            using BoundedMemory<byte> asciiMem = BoundedMemory.Allocate<byte>(128);
+
+            // Fill source with 00 .. 7F, then make it read-only so stray writes to the source fault.
+
+            Span<char> utf16Span = utf16Mem.Span;
+            for (int i = 0; i < utf16Span.Length; i++)
+            {
+                utf16Span[i] = (char)i;
+            }
+            utf16Mem.MakeReadonly();
+
+            // We'll write to the ASCII span.
+            // We test with a variety of span lengths to test alignment and fallthrough code paths.
+
+            Span<byte> asciiSpan = asciiMem.Span;
+
+            for (int i = 0; i < utf16Span.Length; i++)
+            {
+                asciiSpan.Clear(); // remove any data from previous iteration
+
+                // First, validate that the workhorse saw the incoming data as all-ASCII.
+
+                Assert.Equal(128 - i, CallNarrowUtf16ToAscii(utf16Span.Slice(i), asciiSpan.Slice(i)));
+
+                // Then, validate that every element in [i, 128) was transcoded properly (index with j, not i).
+
+                for (int j = i; j < 128; j++)
+                {
+                    Assert.Equal((ushort)utf16Span[j], (ushort)asciiSpan[j]);
+                }
+            }
+        }
+
+        [Fact]
+        public static void NarrowUtf16ToAscii_SomeNonAsciiInput()
+        {
+            using BoundedMemory<char> utf16Mem = BoundedMemory.Allocate<char>(128);
+            using BoundedMemory<byte> asciiMem = BoundedMemory.Allocate<byte>(128);
+
+            // Fill source with 00 .. 7F. The source stays writable: each iteration below replaces one more char with U+0123.
+
+            Span<char> utf16Span = utf16Mem.Span;
+            for (int i = 0; i < utf16Span.Length; i++)
+            {
+                utf16Span[i] = (char)i;
+            }
+
+            // We'll write to the ASCII span.
+
+            Span<byte> asciiSpan = asciiMem.Span;
+
+            for (int i = utf16Span.Length - 1; i >= 0; i--)
+            {
+                RandomNumberGenerator.Fill(asciiSpan); // fill with garbage
+
+                // First, keep track of the garbage we wrote to the destination.
+                // We want to ensure it wasn't overwritten.
+
+                byte[] expectedTrailingData = asciiSpan.Slice(i).ToArray();
+
+                // Then, set the desired char as non-ASCII, then check that the workhorse
+                // correctly saw the data as non-ASCII (it must stop at index i).
+
+                utf16Span[i] = '\u0123'; // use U+0123 instead of U+0080 since it catches inappropriate pmovmskb usage
+                Assert.Equal(i, CallNarrowUtf16ToAscii(utf16Span, asciiSpan));
+
+                // Next, validate that the ASCII data before index i was transcoded properly.
+
+                for (int j = 0; j < i; j++)
+                {
+                    Assert.Equal((ushort)utf16Span[j], (ushort)asciiSpan[j]);
+                }
+
+                // Finally, validate that the destination from index i onward still holds the garbage captured above.
+
+                Assert.Equal(expectedTrailingData, asciiSpan.Slice(i).ToArray());
+            }
+        }
+
+        private static int CallGetIndexOfFirstNonAsciiByte(ReadOnlySpan<byte> buffer)
+        {
+            fixed (byte* pData = &MemoryMarshal.GetReference(buffer))
+            {
+                var firstNonAscii = ASCIIUtility.GetIndexOfFirstNonAsciiByte(pData, (UIntPtr)buffer.Length);
+                return checked((int)firstNonAscii); // conversions between UIntPtr <-> int are not checked by default
+            }
+        }
+
+        private static int CallGetIndexOfFirstNonAsciiChar(ReadOnlySpan<char> buffer)
+        {
+            fixed (char* pData = &MemoryMarshal.GetReference(buffer))
+            {
+                var firstNonAscii = ASCIIUtility.GetIndexOfFirstNonAsciiChar(pData, (UIntPtr)buffer.Length);
+                return checked((int)firstNonAscii); // conversions between UIntPtr <-> int are not checked by default
+            }
+        }
+
+        private static int CallNarrowUtf16ToAscii(ReadOnlySpan<char> utf16, Span<byte> ascii)
+        {
+            Assert.Equal(utf16.Length, ascii.Length);
+
+            fixed (char* pSource = &MemoryMarshal.GetReference(utf16))
+            fixed (byte* pDestination = &MemoryMarshal.GetReference(ascii))
+            {
+                var narrowedCount = ASCIIUtility.NarrowUtf16ToAscii(pSource, pDestination, (UIntPtr)utf16.Length);
+                return checked((int)narrowedCount); // conversions between UIntPtr <-> int are not checked by default
+            }
+        }
+
+        private static int CallWidenAsciiToUtf16(ReadOnlySpan<byte> ascii, Span<char> utf16)
+        {
+            Assert.Equal(ascii.Length, utf16.Length);
+
+            fixed (byte* pSource = &MemoryMarshal.GetReference(ascii))
+            fixed (char* pDestination = &MemoryMarshal.GetReference(utf16))
+            {
+                var widenedCount = ASCIIUtility.WidenAsciiToUtf16(pSource, pDestination, (UIntPtr)ascii.Length);
+                return checked((int)widenedCount); // conversions between UIntPtr <-> int are not checked by default
+            }
+        }
+    }
+}
+#endif
diff --git a/test/DotNetty.Common.Tests/Internal/CoreLib/BoundedMemory.Creation.cs b/test/DotNetty.Common.Tests/Internal/CoreLib/BoundedMemory.Creation.cs
new file mode 100644
index 000000000..9583dc0fc
--- /dev/null
+++ b/test/DotNetty.Common.Tests/Internal/CoreLib/BoundedMemory.Creation.cs
@@ -0,0 +1,95 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#if CORELIBTEST
+using System;
+using System.Runtime.InteropServices;
+
+namespace DotNetty.Common.Tests.Internal.CoreLib
+{
+    /// <summary>
+    /// Contains factory methods to create <see cref="BoundedMemory{T}"/> instances.
+    /// </summary>
+    public static partial class BoundedMemory
+    {
+        /// <summary>
+        /// Allocates a new <see cref="BoundedMemory{T}"/> region which is immediately preceded by
+        /// or immediately followed by a poison (PAGE_NOACCESS) page. If <paramref name="placement"/>
+        /// is <see cref="PoisonPagePlacement.Before"/>, then attempting to read the memory
+        /// immediately before the returned <see cref="BoundedMemory{T}"/> will result in an AV.
+        /// If <paramref name="placement"/> is <see cref="PoisonPagePlacement.After"/>, then
+        /// attempting to read the memory immediately after the returned <see cref="BoundedMemory{T}"/>
+        /// will result in an AV. Poison pages are only set up on Windows; elsewhere a plain array is used.
+        /// </summary>
+        /// <remarks>
+        /// The newly-allocated memory will be populated with random (not cryptographically strong) data.
+        /// </remarks>
+        public static BoundedMemory<T> Allocate<T>(int elementCount, PoisonPagePlacement placement = PoisonPagePlacement.After) where T : unmanaged
+        {
+            if (elementCount < 0)
+            {
+                throw new ArgumentOutOfRangeException(nameof(elementCount));
+            }
+            if (placement != PoisonPagePlacement.Before && placement != PoisonPagePlacement.After)
+            {
+                throw new ArgumentOutOfRangeException(nameof(placement));
+            }
+
+            var retVal = AllocateWithoutDataPopulation<T>(elementCount, placement);
+            FillRandom(MemoryMarshal.AsBytes(retVal.Span));
+            return retVal;
+        }
+
+        /// <summary>
+        /// Similar to <see cref="Allocate{T}(int, PoisonPagePlacement)"/>, but populates the allocated
+        /// memory block with a copy of <paramref name="data"/> rather than using random data.
+        /// </summary>
+        public static BoundedMemory<T> AllocateFromExistingData<T>(ReadOnlySpan<T> data, PoisonPagePlacement placement = PoisonPagePlacement.After) where T : unmanaged
+        {
+            if (placement != PoisonPagePlacement.Before && placement != PoisonPagePlacement.After)
+            {
+                throw new ArgumentOutOfRangeException(nameof(placement));
+            }
+
+            var retVal = AllocateWithoutDataPopulation<T>(data.Length, placement);
+            data.CopyTo(retVal.Span);
+            return retVal;
+        }
+
+        /// <summary>
+        /// Array overload: similar to <see cref="Allocate{T}(int, PoisonPagePlacement)"/>, but populates the
+        /// allocated memory block with a copy of <paramref name="data"/> rather than using random data.
+        /// </summary>
+        public static BoundedMemory<T> AllocateFromExistingData<T>(T[] data, PoisonPagePlacement placement = PoisonPagePlacement.After) where T : unmanaged
+        {
+            return AllocateFromExistingData(new ReadOnlySpan<T>(data), placement);
+        }
+
+        private static void FillRandom(Span<byte> buffer)
+        {
+            // Random.NextBytes(Span<byte>) doesn't exist on all platforms we target,
+            // so each byte is drawn one at a time from a Random instance instead.
+
+            Random rng = new Random(); // doesn't need to be cryptographically strong
+
+            for (int index = 0; index != buffer.Length; index++)
+            {
+                buffer[index] = (byte)rng.Next();
+            }
+        }
+
+        private static BoundedMemory<T> AllocateWithoutDataPopulation<T>(int elementCount, PoisonPagePlacement placement) where T : unmanaged
+        {
+            if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
+            {
+                return AllocateWithoutDataPopulationUnix<T>(elementCount, placement);
+            }
+            else
+            {
+                return AllocateWithoutDataPopulationWindows<T>(elementCount, placement);
+            }
+        }
+    }
+}
+#endif
diff --git a/test/DotNetty.Common.Tests/Internal/CoreLib/BoundedMemory.Unix.cs b/test/DotNetty.Common.Tests/Internal/CoreLib/BoundedMemory.Unix.cs
new file mode 100644
index 000000000..8ab9477d7
--- /dev/null
+++ b/test/DotNetty.Common.Tests/Internal/CoreLib/BoundedMemory.Unix.cs
@@ -0,0 +1,50 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#if CORELIBTEST
+using System;
+
+namespace DotNetty.Common.Tests.Internal.CoreLib
+{
+    public static partial class BoundedMemory
+    {
+        private static UnixImplementation<T> AllocateWithoutDataPopulationUnix<T>(int elementCount, PoisonPagePlacement placement) where T : unmanaged
+        {
+            // On non-Windows platforms, we don't yet have support for changing the permissions of individual pages, so 'placement' is ignored.
+            return new UnixImplementation<T>(elementCount);
+        }
+
+        private sealed class UnixImplementation<T> : BoundedMemory<T> where T : unmanaged
+        {
+            private readonly T[] _array; // managed backing store; no guard pages surround it
+
+            public UnixImplementation(int elementCount)
+            {
+                _array = new T[elementCount];
+            }
+
+            public override bool IsReadonly => false;
+
+            public override Memory<T> Memory => new Memory<T>(_array);
+
+            public override Span<T> Span => new Span<T>(_array);
+
+            public override void Dispose()
+            {
+                // nothing to release: the backing array is ordinary managed memory
+            }
+
+            public override void MakeReadonly()
+            {
+                // intentionally empty: page protection is not implemented here, so the block stays writable
+            }
+
+            public override void MakeWriteable()
+            {
+                // intentionally empty: the block is always writable
+            }
+        }
+    }
+}
+#endif
diff --git a/test/DotNetty.Common.Tests/Internal/CoreLib/BoundedMemory.Windows.cs b/test/DotNetty.Common.Tests/Internal/CoreLib/BoundedMemory.Windows.cs
new file mode 100644
index 000000000..f42163c1d
--- /dev/null
+++ b/test/DotNetty.Common.Tests/Internal/CoreLib/BoundedMemory.Windows.cs
@@ -0,0 +1,335 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#if CORELIBTEST
+using System;
+using System.Buffers;
+using System.Runtime.ConstrainedExecution;
+using System.Runtime.InteropServices;
+using System.Security;
+
+namespace DotNetty.Common.Tests.Internal.CoreLib
+{
+    public static unsafe partial class BoundedMemory
+    {
+        private static readonly int SystemPageSize = Environment.SystemPageSize;
+
+        private static WindowsImplementation<T> AllocateWithoutDataPopulationWindows<T>(int elementCount, PoisonPagePlacement placement) where T : unmanaged
+        {
+            long cb, totalBytesToAllocate; // cb = payload size in bytes; totalBytesToAllocate = payload rounded up to whole pages + 2 poison pages
+            checked
+            {
+                cb = elementCount * sizeof(T); // NOTE(review): both operands are int, so this throws OverflowException once the byte count exceeds int.MaxValue
+                totalBytesToAllocate = cb;
+
+                // We only need to round the count up if it's not an exact multiple
+                // of the system page size.
+
+                var leftoverBytes = totalBytesToAllocate % SystemPageSize;
+                if (leftoverBytes != 0)
+                {
+                    totalBytesToAllocate += SystemPageSize - leftoverBytes;
+                }
+
+                // Finally, account for the poison pages at the front and back.
+
+                totalBytesToAllocate += 2 * SystemPageSize;
+            }
+
+            // Reserve and commit the entire range as NOACCESS.
+
+            var handle = UnsafeNativeMethods.VirtualAlloc(
+                lpAddress: IntPtr.Zero,
+                dwSize: (IntPtr)totalBytesToAllocate /* cast throws OverflowException if out of range */,
+                flAllocationType: VirtualAllocAllocationType.MEM_RESERVE | VirtualAllocAllocationType.MEM_COMMIT,
+                flProtect: VirtualAllocProtection.PAGE_NOACCESS);
+
+            if (handle == null || handle.IsInvalid)
+            {
+                Marshal.ThrowExceptionForHR(Marshal.GetHRForLastWin32Error());
+                throw new InvalidOperationException("VirtualAlloc failed unexpectedly."); // fallback in case the call above did not throw
+            }
+
+            // Done allocating! Now carve out a READWRITE section bookended by the NOACCESS
+            // pages and return that carved-out section to the caller. Since memory protection
+            // flags only apply at page-level granularity, we need to "left-align" or "right-
+            // align" the section we carve out so that it's guaranteed adjacent to one of
+            // the NOACCESS bookend pages.
+
+            return new WindowsImplementation<T>(
+                handle: handle,
+                byteOffsetIntoHandle: (placement == PoisonPagePlacement.Before)
+                    ? SystemPageSize /* just after leading poison page */
+                    : checked((int)(totalBytesToAllocate - SystemPageSize - cb)) /* just before trailing poison page */,
+                elementCount: elementCount)
+            {
+                Protection = VirtualAllocProtection.PAGE_READWRITE // the setter calls VirtualProtect on the payload range (no-op when elementCount is 0)
+            };
+        }
+
+        private sealed class WindowsImplementation<T> : BoundedMemory<T> where T : unmanaged
+        {
+            private readonly VirtualAllocHandle _handle;
+            private readonly int _byteOffsetIntoHandle;
+            private readonly int _elementCount;
+            private readonly BoundedMemoryManager _memoryManager;
+
+            internal WindowsImplementation(VirtualAllocHandle handle, int byteOffsetIntoHandle, int elementCount)
+            {
+                _handle = handle; // owns the whole VirtualAlloc'd range, poison pages included
+                _byteOffsetIntoHandle = byteOffsetIntoHandle; // byte distance from the allocation base to the first payload element
+                _elementCount = elementCount;
+                _memoryManager = new BoundedMemoryManager(this); // backs the Memory property
+            }
+
+            public override bool IsReadonly => (Protection != VirtualAllocProtection.PAGE_READWRITE); // any protection other than PAGE_READWRITE is reported as read-only
+
+            internal VirtualAllocProtection Protection // page protection at the payload's first byte (get: VirtualQuery, set: VirtualProtect)
+            {
+                get
+                {
+                    bool refAdded = false;
+                    try
+                    {
+                        _handle.DangerousAddRef(ref refAdded); // hold a ref on the handle while its raw address is being used
+                        if (UnsafeNativeMethods.VirtualQuery(
+                            lpAddress: _handle.DangerousGetHandle() + _byteOffsetIntoHandle,
+                            lpBuffer: out var memoryInfo,
+                            dwLength: (IntPtr)sizeof(MEMORY_BASIC_INFORMATION)) == IntPtr.Zero) // a zero return value is treated as failure
+                        {
+                            Marshal.ThrowExceptionForHR(Marshal.GetHRForLastWin32Error());
+                            throw new InvalidOperationException("VirtualQuery failed unexpectedly."); // fallback in case the call above did not throw
+                        }
+                        return memoryInfo.Protect;
+                    }
+                    finally
+                    {
+                        if (refAdded)
+                        {
+                            _handle.DangerousRelease();
+                        }
+                    }
+                }
+                set
+                {
+                    if (_elementCount > 0) // an empty block has no bytes to re-protect, so the setter does nothing
+                    {
+                        bool refAdded = false;
+                        try
+                        {
+                            _handle.DangerousAddRef(ref refAdded); // hold a ref on the handle while its raw address is being used
+                            if (!UnsafeNativeMethods.VirtualProtect(
+                                lpAddress: _handle.DangerousGetHandle() + _byteOffsetIntoHandle,
+                                dwSize: (IntPtr)(&((T*)null)[_elementCount]), // address of element [_elementCount] from a null base == payload size in bytes
+                                flNewProtect: value,
+                                lpflOldProtect: out _)) // previous protection is not needed
+                            {
+                                Marshal.ThrowExceptionForHR(Marshal.GetHRForLastWin32Error());
+                                throw new InvalidOperationException("VirtualProtect failed unexpectedly."); // fallback in case the call above did not throw
+                            }
+                        }
+                        finally
+                        {
+                            if (refAdded)
+                            {
+                                _handle.DangerousRelease();
+                            }
+                        }
+                    }
+                }
+            }
+
+            public override Memory<T> Memory => _memoryManager.Memory; // produced by the nested MemoryManager<T>, which covers _elementCount elements
+
+            public override Span<T> Span // span over the payload: _elementCount elements starting _byteOffsetIntoHandle bytes into the allocation
+            {
+                get
+                {
+                    bool refAdded = false;
+                    try
+                    {
+                        _handle.DangerousAddRef(ref refAdded);
+                        return new Span<T>((void*)(_handle.DangerousGetHandle() + _byteOffsetIntoHandle), _elementCount);
+                    }
+                    finally
+                    {
+                        if (refAdded)
+                        {
+                            _handle.DangerousRelease(); // the ref is dropped before the caller uses the span, so the caller must keep this instance undisposed
+                        }
+                    }
+                }
+            }
+
+            public override void Dispose()
+            {
+                _handle.Dispose(); // disposes the SafeHandle, whose ReleaseHandle calls VirtualFree
+            }
+
+            public override void MakeReadonly()
+            {
+                Protection = VirtualAllocProtection.PAGE_READONLY; // the setter skips the VirtualProtect call when the block is empty
+            }
+
+            public override void MakeWriteable()
+            {
+                Protection = VirtualAllocProtection.PAGE_READWRITE; // the setter skips the VirtualProtect call when the block is empty
+            }
+
+            private sealed class BoundedMemoryManager : MemoryManager<T> // exposes the payload of a WindowsImplementation<T> as Memory<T>
+            {
+                private readonly WindowsImplementation<T> _impl;
+
+                public BoundedMemoryManager(WindowsImplementation<T> impl)
+                {
+                    _impl = impl;
+                }
+
+                public override Memory<T> Memory => CreateMemory(_impl._elementCount);
+
+                protected override void Dispose(bool disposing)
+                {
+                    // no-op; the handle will be disposed separately
+                }
+
+                public override Span<T> GetSpan()
+                {
+                    return _impl.Span; // Memory<T>.Span is served by this method, so it must return the payload span instead of throwing
+                }
+
+                public override MemoryHandle Pin(int elementIndex)
+                {
+                    if ((uint)elementIndex > (uint)_impl._elementCount) // one unsigned compare rejects negatives too; elementIndex == count is allowed
+                    {
+                        throw new ArgumentOutOfRangeException(paramName: nameof(elementIndex));
+                    }
+
+                    bool refAdded = false;
+                    try
+                    {
+                        _impl._handle.DangerousAddRef(ref refAdded);
+                        return new MemoryHandle((T*)(_impl._handle.DangerousGetHandle() + _impl._byteOffsetIntoHandle) + elementIndex);
+                    }
+                    finally
+                    {
+                        if (refAdded)
+                        {
+                            _impl._handle.DangerousRelease(); // the ref is not held for the lifetime of the returned MemoryHandle
+                        }
+                    }
+                }
+
+                public override void Unpin()
+                {
+                    // no-op - we don't unpin native memory
+                }
+            }
+        }
+
+        // from winnt.h; used here for VirtualAlloc's flAllocationType, VirtualFree's dwFreeType and MEMORY_BASIC_INFORMATION's State / Type
+        [Flags]
+        private enum VirtualAllocAllocationType : uint
+        {
+            MEM_COMMIT = 0x1000,
+            MEM_RESERVE = 0x2000,
+            MEM_DECOMMIT = 0x4000,
+            MEM_RELEASE = 0x8000,
+            MEM_FREE = 0x10000,
+            MEM_PRIVATE = 0x20000,
+            MEM_MAPPED = 0x40000,
+            MEM_RESET = 0x80000,
+            MEM_TOP_DOWN = 0x100000,
+            MEM_WRITE_WATCH = 0x200000,
+            MEM_PHYSICAL = 0x400000,
+            MEM_ROTATE = 0x800000,
+            MEM_LARGE_PAGES = 0x20000000,
+            MEM_4MB_PAGES = 0x80000000,
+        }
+
+        // from winnt.h; page-protection values used here for flProtect, flNewProtect and MEMORY_BASIC_INFORMATION's protection fields
+        [Flags]
+        private enum VirtualAllocProtection : uint
+        {
+            PAGE_NOACCESS = 0x01,
+            PAGE_READONLY = 0x02,
+            PAGE_READWRITE = 0x04,
+            PAGE_WRITECOPY = 0x08,
+            PAGE_EXECUTE = 0x10,
+            PAGE_EXECUTE_READ = 0x20,
+            PAGE_EXECUTE_READWRITE = 0x40,
+            PAGE_EXECUTE_WRITECOPY = 0x80,
+            PAGE_GUARD = 0x100,
+            PAGE_NOCACHE = 0x200,
+            PAGE_WRITECOMBINE = 0x400,
+        }
+
+        [StructLayout(LayoutKind.Sequential)]
+        private struct MEMORY_BASIC_INFORMATION // out-buffer filled in by VirtualQuery; sequential layout, so field order matters
+        {
+            public IntPtr BaseAddress;
+            public IntPtr AllocationBase;
+            public VirtualAllocProtection AllocationProtect;
+            public IntPtr RegionSize;
+            public VirtualAllocAllocationType State;
+            public VirtualAllocProtection Protect; // the only field this file reads (in the Protection getter)
+            public VirtualAllocAllocationType Type;
+        };
+
+        private sealed class VirtualAllocHandle : SafeHandle // owns a base address returned by VirtualAlloc
+        {
+            // Called by P/Invoke when returning SafeHandles
+            private VirtualAllocHandle()
+                : base(IntPtr.Zero, ownsHandle: true)
+            {
+            }
+
+            // Do not provide a finalizer - SafeHandle's critical finalizer will
+            // call ReleaseHandle for you.
+
+            public override bool IsInvalid => (handle == IntPtr.Zero); // a zero address is the invalid value
+
+            protected override bool ReleaseHandle() =>
+                UnsafeNativeMethods.VirtualFree(handle, IntPtr.Zero, VirtualAllocAllocationType.MEM_RELEASE); // size 0 is passed together with MEM_RELEASE
+        }
+
+        [SuppressUnmanagedCodeSecurity]
+        private static class UnsafeNativeMethods // kernel32 P/Invoke declarations used by the Windows implementation
+        {
+            private const string KERNEL32_LIB = "kernel32.dll";
+
+            // VirtualAlloc: https://msdn.microsoft.com/en-us/library/windows/desktop/aa366887(v=vs.85).aspx
+            [DllImport(KERNEL32_LIB, CallingConvention = CallingConvention.Winapi, SetLastError = true)]
+            public static extern VirtualAllocHandle VirtualAlloc(
+                [In] IntPtr lpAddress,
+                [In] IntPtr dwSize,
+                [In] VirtualAllocAllocationType flAllocationType,
+                [In] VirtualAllocProtection flProtect);
+
+            // VirtualFree: https://msdn.microsoft.com/en-us/library/windows/desktop/aa366892(v=vs.85).aspx
+            [DllImport(KERNEL32_LIB, CallingConvention = CallingConvention.Winapi, SetLastError = true)]
+            [return: MarshalAs(UnmanagedType.Bool)]
+            public static extern bool VirtualFree(
+                [In] IntPtr lpAddress,
+                [In] IntPtr dwSize,
+                [In] VirtualAllocAllocationType dwFreeType);
+
+            // VirtualProtect: https://msdn.microsoft.com/en-us/library/windows/desktop/aa366898(v=vs.85).aspx
+            [DllImport(KERNEL32_LIB, CallingConvention = CallingConvention.Winapi, SetLastError = true)]
+            [return: MarshalAs(UnmanagedType.Bool)]
+            public static extern bool VirtualProtect(
+                [In] IntPtr lpAddress,
+                [In] IntPtr dwSize,
+                [In] VirtualAllocProtection flNewProtect,
+                [Out] out VirtualAllocProtection lpflOldProtect);
+
+            // VirtualQuery: https://msdn.microsoft.com/en-us/library/windows/desktop/aa366902(v=vs.85).aspx
+            [DllImport(KERNEL32_LIB, CallingConvention = CallingConvention.Winapi, SetLastError = true)]
+            public static extern IntPtr VirtualQuery(
+                [In] IntPtr lpAddress,
+                [Out] out MEMORY_BASIC_INFORMATION lpBuffer,
+                [In] IntPtr dwLength);
+        }
+    }
+}
+#endif
diff --git a/test/DotNetty.Common.Tests/Internal/CoreLib/BoundedMemory.cs b/test/DotNetty.Common.Tests/Internal/CoreLib/BoundedMemory.cs
new file mode 100644
index 000000000..14d8cb1fe
--- /dev/null
+++ b/test/DotNetty.Common.Tests/Internal/CoreLib/BoundedMemory.cs
@@ -0,0 +1,53 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#if CORELIBTEST
+using System;
+
+namespace DotNetty.Common.Tests.Internal.CoreLib
+{
+    /// <summary>
+    /// Represents a region of native memory. The <see cref="Memory"/> property can be used
+    /// to get a <see cref="Memory{T}"/> backed by this memory region.
+    /// </summary>
+    public abstract class BoundedMemory<T> : IDisposable where T : unmanaged
+    {
+        /// <summary>
+        /// Returns a value stating whether this native memory block is readonly.
+        /// </summary>
+        public abstract bool IsReadonly { get; }
+
+        /// <summary>
+        /// Gets the <see cref="Memory{T}"/> which represents this native memory.
+        /// This <see cref="BoundedMemory{T}"/> instance must be kept alive while working with the <see cref="Memory{T}"/>.
+        /// </summary>
+        public abstract Memory<T> Memory { get; }
+
+        /// <summary>
+        /// Gets the <see cref="Span{T}"/> which represents this native memory.
+        /// This <see cref="BoundedMemory{T}"/> instance must be kept alive while working with the <see cref="Span{T}"/>.
+        /// </summary>
+        public abstract Span<T> Span { get; }
+
+        /// <summary>
+        /// Disposes this <see cref="BoundedMemory{T}"/> instance.
+        /// </summary>
+        public abstract void Dispose();
+
+        /// <summary>
+        /// Sets this native memory block to be readonly. Writes to this block will cause an AV.
+        /// This method has no effect if the memory block is zero length or if the underlying
+        /// OS does not support marking the memory block as readonly.
+        /// </summary>
+        public abstract void MakeReadonly();
+
+        /// <summary>
+        /// Sets this native memory block to be read+write.
+        /// This method has no effect if the memory block is zero length or if the underlying
+        /// OS does not support marking the memory block as read+write.
+        /// </summary>
+        public abstract void MakeWriteable();
+    }
+}
+#endif
diff --git a/test/DotNetty.Common.Tests/Internal/CoreLib/PoisonPagePlacement.cs b/test/DotNetty.Common.Tests/Internal/CoreLib/PoisonPagePlacement.cs
new file mode 100644
index 000000000..e1cc54e3d
--- /dev/null
+++ b/test/DotNetty.Common.Tests/Internal/CoreLib/PoisonPagePlacement.cs
@@ -0,0 +1,28 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#if CORELIBTEST
+namespace DotNetty.Common.Tests.Internal.CoreLib
+{
+    /// <summary>
+    /// Dictates where the poison page should be placed.
+    /// </summary>
+    public enum PoisonPagePlacement
+    {
+        /// <summary>
+        /// The poison page should be placed immediately after the memory region.
+        /// Attempting to access the memory page immediately following the
+        /// span will result in an access violation (AV).
+        /// </summary>
+        After,
+
+        /// <summary>
+        /// The poison page should be placed immediately before the memory region.
+        /// Attempting to access the memory page immediately before the
+        /// span will result in an access violation (AV).
+        /// </summary>
+        Before,
+    }
+}
+#endif
diff --git a/test/DotNetty.Common.Tests/Internal/CoreLib/Utf16UtilityTests.ValidateChars.cs b/test/DotNetty.Common.Tests/Internal/CoreLib/Utf16UtilityTests.ValidateChars.cs
new file mode 100644
index 000000000..b0635ea9a
--- /dev/null
+++ b/test/DotNetty.Common.Tests/Internal/CoreLib/Utf16UtilityTests.ValidateChars.cs
@@ -0,0 +1,267 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#if CORELIBTEST
+using System;
+using System.Buffers;
+using System.Globalization;
+using System.Linq;
+using System.Numerics;
+using System.Runtime.InteropServices;
+using DotNetty.Common.Internal;
+using Xunit;
+
+namespace DotNetty.Common.Tests.Internal.CoreLib
+{
+    public class Utf16UtilityTests
+    {
+        [Theory]
+        [InlineData("", 0, 0)] // empty string is OK
+        [InlineData("X", 1, 1)] // ASCII: one rune and one UTF-8 byte per char
+        [InlineData("XY", 2, 2)]
+        [InlineData("XYZ", 3, 3)]
+        [InlineData("<EACU>", 1, 2)] // U+00E9: one rune, two UTF-8 bytes
+        [InlineData("X<EACU>", 2, 3)]
+        [InlineData("<EACU>X", 2, 3)]
+        [InlineData("<EURO>", 1, 3)] // U+20AC: one rune, three UTF-8 bytes
+        [InlineData("<GRIN>", 1, 4)] // U+1F600: one rune (two UTF-16 chars), four UTF-8 bytes
+        [InlineData("X<GRIN>Z", 3, 6)]
+        [InlineData("X<0000>Z", 3, 3)] // null chars are allowed
+        public void GetIndexOfFirstInvalidUtf16Sequence_WithSmallValidBuffers(string unprocessedInput, int expectedRuneCount, int expectedUtf8ByteCount)
+        {
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(unprocessedInput, -1 /* expectedIdxOfFirstInvalidChar: -1 means the whole input is valid */, expectedRuneCount, expectedUtf8ByteCount);
+        }
+
+        [Theory]
+        [InlineData("<DC00>", 0, 0, 0)] // standalone low surrogate (at beginning of sequence)
+        [InlineData("X<DC00>", 1, 1, 1)] // standalone low surrogate (preceded by valid ASCII data)
+        [InlineData("<EURO><DC00>", 1, 1, 3)] // standalone low surrogate (preceded by valid non-ASCII data)
+        [InlineData("<D800>", 0, 0, 0)] // standalone high surrogate (missing follow-up low surrogate)
+        [InlineData("<D800>Y", 0, 0, 0)] // standalone high surrogate (followed by ASCII char)
+        [InlineData("<D800><D800>", 0, 0, 0)] // standalone high surrogate (followed by high surrogate)
+        [InlineData("<D800><EURO>", 0, 0, 0)] // standalone high surrogate (followed by valid non-ASCII char)
+        [InlineData("<DC00><DC00>", 0, 0, 0)] // standalone low surrogate (not preceded by a high surrogate, followed by another low surrogate)
+        [InlineData("<DC00><D800>", 0, 0, 0)] // standalone low surrogate (not preceded by a high surrogate, followed by a high surrogate)
+        [InlineData("<GRIN><DC00><DC00>", 2, 1, 4)] // standalone low surrogate (preceded by a valid surrogate pair, followed by a low surrogate)
+        [InlineData("<GRIN><DC00><D800>", 2, 1, 4)] // standalone low surrogate (preceded by a valid surrogate pair, followed by a high surrogate)
+        [InlineData("<GRIN><0000><DC00><D800>", 3, 2, 5)] // standalone low surrogate (preceded by a valid null char)
+        public void GetIndexOfFirstInvalidUtf16Sequence_WithSmallInvalidBuffers(string unprocessedInput, int idxOfFirstInvalidChar, int expectedRuneCount, int expectedUtf8ByteCount)
+        {
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(unprocessedInput, idxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount);
+        }
+
+        [Theory] // chars below presented as hex since Xunit doesn't like invalid UTF-16 string literals
+        [InlineData("<2BB4><218C><1BC0><613F><F9E9><B740><DE38><E689>", 6, 6, 18)] // first invalid char: unpaired low surrogate <DE38> at index 6
+        [InlineData("<1854><C980><012C><4797><DD5A><41D0><A104><5464>", 4, 4, 11)] // first invalid char: unpaired low surrogate <DD5A> at index 4
+        [InlineData("<F1AF><8BD3><5037><BE29><DEFF><3E3A><DD71><6336>", 4, 4, 12)] // first invalid char: unpaired low surrogate <DEFF> at index 4
+        [InlineData("<B978><0F25><DC23><D3BB><7352><4025><0B93><4107>", 2, 2, 6)] // first invalid char: unpaired low surrogate <DC23> at index 2
+        [InlineData("<887C><C980><012C><4797><DD5A><41D0><A104><5464>", 4, 4, 11)] // first invalid char: unpaired low surrogate <DD5A> at index 4
+        public void GetIndexOfFirstInvalidUtf16Sequence_WithEightRandomCharsContainingUnpairedSurrogates(string unprocessedInput, int idxOfFirstInvalidChar, int expectedRuneCount, int expectedUtf8ByteCount)
+        {
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(unprocessedInput, idxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount);
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf16Sequence_WithInvalidSurrogateSequences()
+        {
+            // All ASCII
+
+            char[] chars = Enumerable.Repeat('x', 128).ToArray();
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, -1, expectedRuneCount: 128, expectedUtf8ByteCount: 128);
+
+            // Throw a surrogate pair at the beginning
+
+            chars[0] = '\uD800';
+            chars[1] = '\uDFFF';
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, -1, expectedRuneCount: 127, expectedUtf8ByteCount: 130); // 126 ASCII + one 4-byte pair
+
+            // Throw a surrogate pair near the end
+
+            chars[124] = '\uD800';
+            chars[125] = '\uDFFF';
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, -1, expectedRuneCount: 126, expectedUtf8ByteCount: 132); // 124 ASCII + two 4-byte pairs
+
+            // Throw a standalone surrogate code point at the *very* end
+
+            chars[127] = '\uD800'; // high surrogate
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, 127, expectedRuneCount: 125, expectedUtf8ByteCount: 131); // valid prefix chars[0..126]: 123 ASCII + two pairs
+
+            chars[127] = '\uDFFF'; // low surrogate
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, 127, expectedRuneCount: 125, expectedUtf8ByteCount: 131); // same valid prefix as above
+
+            // Make the final surrogate pair valid
+
+            chars[126] = '\uD800'; // high surrogate
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, -1, expectedRuneCount: 125, expectedUtf8ByteCount: 134); // 122 ASCII + three 4-byte pairs
+
+            // Throw an invalid surrogate sequence in the middle (straddles a vector boundary)
+
+            chars[12] = '\u0080'; // 2-byte UTF-8 sequence
+            chars[13] = '\uD800'; // high surrogate
+            chars[14] = '\uD800'; // high surrogate
+            chars[15] = '\uDFFF'; // low surrogate
+            chars[16] = '\uDFFF'; // low surrogate
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, 13, expectedRuneCount: 12, expectedUtf8ByteCount: 16); // valid prefix chars[0..12]: pair + 10 ASCII + one 2-byte char
+
+            // Correct the surrogate sequence we just added
+
+            chars[14] = '\uDC00'; // low surrogate
+            chars[15] = '\uDBFF'; // high surrogate
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, -1, expectedRuneCount: 123, expectedUtf8ByteCount: 139); // 117 ASCII + one 2-byte char + five 4-byte pairs
+
+            // Corrupt the surrogate pair that's split across a vector boundary
+
+            chars[16] = 'x'; // ASCII char (remember.. chars[15] is a high surrogate char)
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, 15, expectedRuneCount: 13, expectedUtf8ByteCount: 20); // valid prefix chars[0..14]: pair + 10 ASCII + one 2-byte char + pair
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf16Sequence_WithStandaloneLowSurrogateCharAtStart()
+        {
+            // The input stream will be a vector's worth of ASCII chars, followed by a single standalone low
+            // surrogate char, then padded with U+0000 until it's a multiple of the vector size.
+            // Using Vector<ushort>.Count here as a stand-in for Vector<char>.Count.
+
+            char[] chars = new char[Vector<ushort>.Count * 2];
+            for (int i = 0; i < Vector<ushort>.Count; i++)
+            {
+                chars[i] = 'x'; // ASCII char
+            }
+
+            chars[Vector<ushort>.Count] = '\uDEAD'; // standalone low surrogate char
+
+            for (int i = 0; i <= Vector<ushort>.Count; i++)
+            {
+                // Expect all ASCII chars to be consumed, low surrogate char to be marked invalid. Each slice leaves exactly i ASCII chars ahead of the low surrogate (i == 0 puts it at the very start).
+                GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars[(Vector<ushort>.Count - i)..], i, i, i);
+            }
+        }
+
+        private static void GetIndexOfFirstInvalidUtf16Sequence_Test_Core(string unprocessedInput, int expectedIdxOfFirstInvalidChar, int expectedRuneCount, long expectedUtf8ByteCount)
+        {
+            char[] processedInput = ProcessInput(unprocessedInput).ToCharArray(); // expands the <NAME> / <XXXX> placeholders into real chars
+
+            // Run the test normally
+
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(processedInput, expectedIdxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount);
+
+            // Put a bunch of ASCII data at the beginning (to test the call to ASCIIUtility at method entry)
+
+            processedInput = Enumerable.Repeat('x', 128).Concat(processedInput).ToArray();
+
+            if (expectedIdxOfFirstInvalidChar >= 0)
+            {
+                expectedIdxOfFirstInvalidChar += 128; // the invalid char shifts by the length of the ASCII prefix; -1 ("all valid") stays -1
+            }
+            expectedRuneCount += 128;
+            expectedUtf8ByteCount += 128;
+
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(processedInput, expectedIdxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount);
+
+            // Change the first few chars to a mixture of 2-byte and 3-byte UTF-8 sequences
+            // This makes sure the vectorized code paths can properly handle these.
+
+            processedInput[0] = '\u0080'; // 2-byte UTF-8 sequence
+            processedInput[1] = '\u0800'; // 3-byte UTF-8 sequence
+            processedInput[2] = '\u0080'; // 2-byte UTF-8 sequence
+            processedInput[3] = '\u0800'; // 3-byte UTF-8 sequence
+            processedInput[4] = '\u0080'; // 2-byte UTF-8 sequence
+            processedInput[5] = '\u0800'; // 3-byte UTF-8 sequence
+            processedInput[6] = '\u0080'; // 2-byte UTF-8 sequence
+            processedInput[7] = '\u0800'; // 3-byte UTF-8 sequence
+
+            expectedUtf8ByteCount += 12; // four chars grew from 1 to 2 bytes (+4), four from 1 to 3 bytes (+8)
+
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(processedInput, expectedIdxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount);
+
+            // Throw some surrogate pairs into the mix to make sure they're also handled properly
+            // by the vectorized code paths.
+
+            processedInput[8] = '\u0080'; // 2-byte UTF-8 sequence
+            processedInput[9] = '\u0800'; // 3-byte UTF-8 sequence
+            processedInput[10] = '\u0080'; // 2-byte UTF-8 sequence
+            processedInput[11] = '\u0800'; // 3-byte UTF-8 sequence
+            processedInput[12] = '\u0080'; // 2-byte UTF-8 sequence
+            processedInput[13] = '\uD800'; // high surrogate
+            processedInput[14] = '\uDC00'; // low surrogate
+            processedInput[15] = 'z'; // ASCII char
+
+            expectedRuneCount--; // the pair at [13..14] is two chars but a single rune
+            expectedUtf8ByteCount += 9; // +1, +2, +1, +2, +1 for chars [8..12], +2 for the pair (4 bytes where two ASCII chars took 2)
+
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(processedInput, expectedIdxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount);
+
+            // Split the next surrogate pair across the vector boundary (so that we
+            // don't inadvertently treat this as a standalone surrogate sequence).
+
+            processedInput[15] = '\uDBFF'; // high surrogate
+            processedInput[16] = '\uDFFF'; // low surrogate
+
+            expectedRuneCount--; // another pair: two chars, one rune
+            expectedUtf8ByteCount += 2; // the pair takes 4 bytes where two ASCII chars took 2
+
+            GetIndexOfFirstInvalidUtf16Sequence_Test_Core(processedInput, expectedIdxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount);
+        }
+
+        private static unsafe void GetIndexOfFirstInvalidUtf16Sequence_Test_Core(char[] input, int expectedRetVal, int expectedRuneCount, long expectedUtf8ByteCount)
+        {
+            // Arrange: place the input in a BoundedMemory<char> block and mark it read-only (writes would then AV).
+
+            using BoundedMemory<char> boundedMemory = BoundedMemory.AllocateFromExistingData(input);
+            boundedMemory.MakeReadonly();
+
+            // Act
+
+            int actualRetVal;
+            long actualUtf8CodeUnitCount;
+            int actualRuneCount;
+
+            fixed (char* pInputBuffer = &MemoryMarshal.GetReference(boundedMemory.Span))
+            {
+                char* pFirstInvalidChar = Utf16Utility.GetPointerToFirstInvalidChar(pInputBuffer, input.Length, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment);
+
+                long ptrDiff = pFirstInvalidChar - pInputBuffer; // number of chars that were validated
+                Assert.True((ulong)ptrDiff <= (uint)input.Length, "ptrDiff was outside expected range.");
+
+                Assert.True(utf8CodeUnitCountAdjustment >= 0, "UTF-8 code unit count adjustment must be non-negative.");
+                Assert.True(scalarCountAdjustment <= 0, "Scalar count adjustment must be 0 or negative.");
+
+                actualRetVal = (ptrDiff == input.Length) ? -1 : (int)ptrDiff; // -1 means no invalid char was found
+
+                // The last two 'out' parameters are:
+                // a) The number to be added to the "chars processed" return value to come up with the total UTF-8 code unit count, and
+                // b) The number to be added to the "total UTF-16 code unit count" value to come up with the total scalar count.
+
+                actualUtf8CodeUnitCount = ptrDiff + utf8CodeUnitCountAdjustment;
+                actualRuneCount = (int)ptrDiff + scalarCountAdjustment;
+            }
+
+            // Assert (xUnit convention: expected value first, so failure messages are labelled correctly)
+
+            Assert.Equal(expectedRetVal, actualRetVal);
+            Assert.Equal(expectedRuneCount, actualRuneCount);
+            Assert.Equal(expectedUtf8ByteCount, actualUtf8CodeUnitCount);
+        }
+
+        private static string ProcessInput(string input)
+        {
+            input = input.Replace("<EACU>", "\u00E9", StringComparison.Ordinal); // U+00E9 LATIN SMALL LETTER E WITH ACUTE
+            input = input.Replace("<EURO>", "\u20AC", StringComparison.Ordinal); // U+20AC EURO SIGN
+            input = input.Replace("<GRIN>", "\U0001F600", StringComparison.Ordinal); // U+1F600 GRINNING FACE
+
+            // Replace <ABCD> with \uABCD. This allows us to pass potentially malformed
+            // UTF-16 strings as test data without Xunit rejecting them. (The unit testing
+            // framework gets angry when we try putting invalid UTF-16 data as inline test data.)
+
+            int idx;
+            while ((idx = input.IndexOf('<')) >= 0)
+            {
+                input = input[..idx] + (char)ushort.Parse(input.Substring(idx + 1, 4), NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture) + input[(idx + 6)..]; // a "<ABCD>" token is 6 chars: '<', four hex digits, '>'
+            }
+
+            return input;
+        }
+    }
+}
+#endif
diff --git a/test/DotNetty.Common.Tests/Internal/CoreLib/Utf8Tests.cs b/test/DotNetty.Common.Tests/Internal/CoreLib/Utf8Tests.cs
new file mode 100644
index 000000000..f0986d77e
--- /dev/null
+++ b/test/DotNetty.Common.Tests/Internal/CoreLib/Utf8Tests.cs
@@ -0,0 +1,799 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#if CORELIBTEST
+using System;
+using System.Buffers;
+using System.Collections.Generic;
+using System.Globalization;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+using DotNetty.Common.Internal;
+using Xunit;
+
+namespace DotNetty.Common.Tests.Internal.CoreLib
+{
+    public class Utf8Tests
+    {
+        private const string X_UTF8 = "58"; // U+0058 LATIN CAPITAL LETTER X, 1 byte
+        private const string X_UTF16 = "X";
+
+        private const string Y_UTF8 = "59"; // U+0059 LATIN CAPITAL LETTER Y, 1 byte
+        private const string Y_UTF16 = "Y";
+
+        private const string Z_UTF8 = "5A"; // U+005A LATIN CAPITAL LETTER Z, 1 byte
+        private const string Z_UTF16 = "Z";
+
+        private const string E_ACUTE_UTF8 = "C3A9"; // U+00E9 LATIN SMALL LETTER E WITH ACUTE, 2 bytes
+        private const string E_ACUTE_UTF16 = "\u00E9";
+
+        private const string EURO_SYMBOL_UTF8 = "E282AC"; // U+20AC EURO SIGN, 3 bytes
+        private const string EURO_SYMBOL_UTF16 = "\u20AC";
+
+        private const string REPLACEMENT_CHAR_UTF8 = "EFBFBD"; // U+FFFD REPLACEMENT CHAR, 3 bytes
+        private const string REPLACEMENT_CHAR_UTF16 = "\uFFFD";
+
+        private const string GRINNING_FACE_UTF8 = "F09F9880"; // U+1F600 GRINNING FACE, 4 bytes
+        private const string GRINNING_FACE_UTF16 = "\U0001F600";
+
+        private const string WOMAN_CARTWHEELING_MEDSKIN_UTF16 = "\U0001F938\U0001F3FD\u200D\u2640\uFE0F"; // U+1F938 U+1F3FD U+200D U+2640 U+FE0F WOMAN CARTWHEELING: MEDIUM SKIN TONE
+
+        // All valid scalars [ U+0000 .. U+D7FF ] and [ U+E000 .. U+10FFFF ], i.e. everything except the surrogate range. This is a lazy LINQ query, re-evaluated on each enumeration.
+        private static readonly IEnumerable<Rune> s_allValidScalars = Enumerable.Range(0x0000, 0xD800).Concat(Enumerable.Range(0xE000, 0x110000 - 0xE000)).Select(value => new Rune(value));
+        // UTF-16 and UTF-8 encodings of every valid scalar, concatenated in the order above; filled in once by the static constructor.
+        private static readonly ReadOnlyMemory<char> s_allScalarsAsUtf16;
+        private static readonly ReadOnlyMemory<byte> s_allScalarsAsUtf8;
+
+        static Utf8Tests()
+        {
+            List<char> allScalarsAsUtf16 = new List<char>();
+            List<byte> allScalarsAsUtf8 = new List<byte>();
+            // Encode with the local ToUtf16 / ToUtf8 reference implementations.
+            foreach (Rune rune in s_allValidScalars)
+            {
+                allScalarsAsUtf16.AddRange(ToUtf16(rune));
+                allScalarsAsUtf8.AddRange(ToUtf8(rune));
+            }
+
+            s_allScalarsAsUtf16 = allScalarsAsUtf16.ToArray().AsMemory();
+            s_allScalarsAsUtf8 = allScalarsAsUtf8.ToArray().AsMemory();
+        }
+
+        /*
+         * COMMON UTILITIES FOR UNIT TESTS
+         */
+
+        public static byte[] DecodeHex(ReadOnlySpan<char> inputHex)
+        {
+            Assert.True(Regex.IsMatch(inputHex.ToString(), "^([0-9a-fA-F]{2})*$"), "Input must be an even number of hex characters."); // zero or more pairs of hex digits; the empty string is accepted
+
+#if NET
+            return Convert.FromHexString(inputHex); // built-in decoder, used when the NET symbol is defined
+#else
+            byte[] retVal = new byte[inputHex.Length / 2]; // two hex digits per output byte
+            for (int i = 0; i < retVal.Length; i++)
+            {
+                retVal[i] = byte.Parse(inputHex.Slice(i * 2, 2), NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture);
+            }
+            return retVal;
+#endif
+        }
+
+        // !! IMPORTANT !!
+        // Don't delete this implementation, as we use it as a reference to make sure the framework's
+        // transcoding logic is correct.
+        public static byte[] ToUtf8(Rune rune)
+        {
+            Assert.True(Rune.IsValid(rune.Value), $"Rune with value U+{(uint)rune.Value:X4} is not well-formed.");
+
+            if (rune.Value < 0x80) // 1-byte sequence: 0xxxxxxx
+            {
+                return new[]
+                {
+                    (byte)rune.Value
+                };
+            }
+            else if (rune.Value < 0x0800) // 2-byte sequence: 110xxxxx 10xxxxxx
+            {
+                return new[]
+                {
+                    (byte)((rune.Value >> 6) | 0xC0),
+                    (byte)((rune.Value & 0x3F) | 0x80)
+                };
+            }
+            else if (rune.Value < 0x10000) // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
+            {
+                return new[]
+                {
+                    (byte)((rune.Value >> 12) | 0xE0),
+                    (byte)(((rune.Value >> 6) & 0x3F) | 0x80),
+                    (byte)((rune.Value & 0x3F) | 0x80)
+                };
+            }
+            else // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+            {
+                return new[]
+                {
+                    (byte)((rune.Value >> 18) | 0xF0),
+                    (byte)(((rune.Value >> 12) & 0x3F) | 0x80),
+                    (byte)(((rune.Value >> 6) & 0x3F) | 0x80),
+                    (byte)((rune.Value & 0x3F) | 0x80)
+                };
+            }
+        }
+
+        // !! IMPORTANT !!
+        // Don't delete this implementation, as we use it as a reference to make sure the framework's
+        // transcoding logic is correct.
+        private static char[] ToUtf16(Rune rune)
+        {
+            Assert.True(Rune.IsValid(rune.Value), $"Rune with value U+{(uint)rune.Value:X4} is not well-formed.");
+
+            if (rune.IsBmp) // a BMP scalar is a single UTF-16 code unit
+            {
+                return new[]
+                {
+                    (char)rune.Value
+                };
+            }
+            else // otherwise encode as a surrogate pair
+            {
+                return new[]
+                {
+                    (char)((rune.Value >> 10) + 0xD800 - 0x40), // high surrogate; "- 0x40" removes the 0x10000 offset (0x10000 >> 10 == 0x40)
+                    (char)((rune.Value & 0x03FF) + 0xDC00) // low surrogate: the bottom 10 bits
+                };
+            }
+        }
+
+        [Theory]
+        [InlineData("", "")] // empty string is OK
+        [InlineData(X_UTF16, X_UTF8)] // 1-byte UTF-8 sequence
+        [InlineData(E_ACUTE_UTF16, E_ACUTE_UTF8)] // 2-byte UTF-8 sequence
+        [InlineData(EURO_SYMBOL_UTF16, EURO_SYMBOL_UTF8)] // 3-byte UTF-8 sequence
+        public void ToBytes_WithSmallValidBuffers(string utf16Input, string expectedUtf8TranscodingHex)
+        {
+            // These test cases are for the "slow processing" code path at the end of TranscodeToUtf8,
+            // so inputs should be less than 2 chars.
+
+            Assert.InRange(utf16Input.Length, 0, 1);
+
+            ToBytes_Test_Core(
+                utf16Input: utf16Input,
+                destinationSize: expectedUtf8TranscodingHex.Length / 2, // two hex digits per byte, so the destination fits the output exactly
+                replaceInvalidSequences: false,
+                isFinalChunk: false,
+                expectedOperationStatus: OperationStatus.Done,
+                expectedNumCharsRead: utf16Input.Length,
+                expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex));
+        }
+
+        [Theory]
+        [InlineData("AB")] // 2 ASCII chars, hits fast inner loop
+        [InlineData("ABCD")] // 4 ASCII chars, hits fast inner loop
+        [InlineData("ABCDEF")] // 6 ASCII chars, hits fast inner loop
+        [InlineData("ABCDEFGH")] // 8 ASCII chars, hits fast inner loop
+        [InlineData("ABCDEFGHIJ")] // 10 ASCII chars, hits fast inner loop
+        [InlineData("ABCDEF" + E_ACUTE_UTF16 + "HIJ")] // interrupts inner loop due to non-ASCII char in first char of first DWORD
+        [InlineData("ABCDEFG" + EURO_SYMBOL_UTF16 + "IJ")] // interrupts inner loop due to non-ASCII char in second char of first DWORD
+        [InlineData("ABCDEFGH" + E_ACUTE_UTF16 + "J")] // interrupts inner loop due to non-ASCII char in first char of second DWORD
+        [InlineData("ABCDEFGHI" + EURO_SYMBOL_UTF16)] // interrupts inner loop due to non-ASCII char in second char of second DWORD
+        [InlineData(X_UTF16 + E_ACUTE_UTF16)] // drains first ASCII char then falls down to slow path
+        [InlineData(X_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16)] // drains first ASCII char then consumes 2x 2-byte sequences at once
+        [InlineData(E_ACUTE_UTF16 + X_UTF16)] // no first ASCII char to drain, consumes 2-byte seq followed by ASCII char
+        [InlineData(E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16)] // stay within 2x 2-byte sequence processing loop
+        [InlineData(E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + X_UTF16)] // break out of 2x 2-byte seq loop due to ASCII data in second char of DWORD
+        [InlineData(E_ACUTE_UTF16 + E_ACUTE_UTF16 + X_UTF16 + X_UTF16)] // break out of 2x 2-byte seq loop due to ASCII data in first char of DWORD
+        [InlineData(E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + EURO_SYMBOL_UTF16)] // break out of 2x 2-byte seq loop due to 3-byte data
+        [InlineData(E_ACUTE_UTF16 + EURO_SYMBOL_UTF16)] // 2-byte logic sees next char isn't ASCII, cannot read full DWORD from remaining input buffer, falls down to slow drain loop
+        [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + X_UTF16)] // 2x 3-byte logic can't read a full DWORD from next part of buffer, falls down to slow drain loop
+        [InlineData(EURO_SYMBOL_UTF16 + X_UTF16)] // 3-byte processing loop consumes trailing ASCII char, but can't read next DWORD, falls down to slow drain loop
+        [InlineData(EURO_SYMBOL_UTF16 + X_UTF16 + X_UTF16)] // 3-byte processing loop consumes trailing ASCII char, but can't read next DWORD, falls down to slow drain loop
+        [InlineData(EURO_SYMBOL_UTF16 + E_ACUTE_UTF16)] // 3-byte processing loop can't consume next ASCII char, can't read DWORD, falls down to slow drain loop
+        [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16)] // stay within 2x 3-byte sequence processing loop
+        [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + X_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16)] // consume stray ASCII char at beginning of DWORD after 2x 3-byte sequence
+        [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + X_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16)] // consume stray ASCII char at end of DWORD after 2x 3-byte sequence
+        [InlineData(EURO_SYMBOL_UTF16 + E_ACUTE_UTF16 + X_UTF16)] // consume 2-byte sequence as second char in DWORD which begins with 3-byte encoded char
+        [InlineData(EURO_SYMBOL_UTF16 + GRINNING_FACE_UTF16)] // 3-byte sequence followed by 4-byte sequence
+        [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + GRINNING_FACE_UTF16)] // 2x 3-byte sequence followed by 4-byte sequence
+        [InlineData(GRINNING_FACE_UTF16)] // single 4-byte surrogate char pair
+        [InlineData(GRINNING_FACE_UTF16 + EURO_SYMBOL_UTF16)] // 4-byte surrogate char pair, cannot read next DWORD, falls down to slow drain loop
+        public void ToBytes_WithLargeValidBuffers(string utf16Input)
+        {
+            // These test cases are for the "fast processing" code which is the main loop of TranscodeToUtf8,
+            // so inputs should be at least 2 chars.
+
+            Assert.True(utf16Input.Length >= 2);
+
+            // We're going to run the tests with destination buffer lengths ranging from 0 all the way
+            // to buffers large enough to hold the full output. This allows us to test logic that
+            // detects whether we're about to overrun our destination buffer and instead returns DestinationTooSmall.
+
+            Rune[] enumeratedScalars = utf16Input.EnumerateRunes().ToArray();
+
+            // 0-length buffer test
+            ToBytes_Test_Core(
+                utf16Input: utf16Input,
+                destinationSize: 0,
+                replaceInvalidSequences: false,
+                isFinalChunk: false,
+                expectedOperationStatus: OperationStatus.DestinationTooSmall,
+                expectedNumCharsRead: 0,
+                expectedUtf8Transcoding: ReadOnlySpan<byte>.Empty);
+
+            int expectedNumCharsConsumed = 0; // UTF-16 chars covered by the scalars that fit so far
+            byte[] concatenatedUtf8 = Array.Empty<byte>(); // reference UTF-8 encoding of those scalars
+
+            for (int i = 0; i < enumeratedScalars.Length; i++)
+            {
+                Rune thisScalar = enumeratedScalars[i];
+
+                // provide partial destination buffers all the way up to (but not including) enough to hold the next full scalar encoding
+                for (int j = 1; j < thisScalar.Utf8SequenceLength; j++)
+                {
+                    ToBytes_Test_Core(
+                        utf16Input: utf16Input,
+                        destinationSize: concatenatedUtf8.Length + j,
+                        replaceInvalidSequences: false,
+                        isFinalChunk: false,
+                        expectedOperationStatus: OperationStatus.DestinationTooSmall,
+                        expectedNumCharsRead: expectedNumCharsConsumed,
+                        expectedUtf8Transcoding: concatenatedUtf8);
+                }
+
+                // now provide a destination buffer large enough to hold the next full scalar encoding
+
+                expectedNumCharsConsumed += thisScalar.Utf16SequenceLength;
+                concatenatedUtf8 = concatenatedUtf8.Concat(ToUtf8(thisScalar)).ToArray();
+
+                ToBytes_Test_Core(
+                   utf16Input: utf16Input,
+                   destinationSize: concatenatedUtf8.Length,
+                   replaceInvalidSequences: false,
+                   isFinalChunk: false,
+                   expectedOperationStatus: (i == enumeratedScalars.Length - 1) ? OperationStatus.Done : OperationStatus.DestinationTooSmall, // Done only once the last scalar fits
+                   expectedNumCharsRead: expectedNumCharsConsumed,
+                   expectedUtf8Transcoding: concatenatedUtf8);
+            }
+
+            // now throw lots of ASCII data at the beginning so that we exercise the vectorized code paths
+
+            utf16Input = new string('x', 64) + utf16Input;
+            concatenatedUtf8 = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray();
+
+            ToBytes_Test_Core(
+                utf16Input: utf16Input,
+                destinationSize: concatenatedUtf8.Length,
+                replaceInvalidSequences: false,
+                isFinalChunk: true,
+                expectedOperationStatus: OperationStatus.Done,
+                expectedNumCharsRead: utf16Input.Length,
+                expectedUtf8Transcoding: concatenatedUtf8);
+
+            // now throw some non-ASCII data at the beginning so that we *don't* exercise the vectorized code paths
+
+            utf16Input = WOMAN_CARTWHEELING_MEDSKIN_UTF16 + utf16Input[64..]; // [64..] drops the 64-char ASCII prefix added above
+            concatenatedUtf8 = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray();
+
+            ToBytes_Test_Core(
+                utf16Input: utf16Input,
+                destinationSize: concatenatedUtf8.Length,
+                replaceInvalidSequences: false,
+                isFinalChunk: true,
+                expectedOperationStatus: OperationStatus.Done,
+                expectedNumCharsRead: utf16Input.Length,
+                expectedUtf8Transcoding: concatenatedUtf8);
+        }
+
+        [Theory]
+        [InlineData('\uD800', OperationStatus.NeedMoreData)] // standalone high surrogate (isFinalChunk is false below, and more data is requested)
+        [InlineData('\uDFFF', OperationStatus.InvalidData)] // standalone low surrogate
+        public void ToBytes_WithOnlyStandaloneSurrogates(char charValue, OperationStatus expectedOperationStatus)
+        {
+            ToBytes_Test_Core(
+                utf16Input: new[] { charValue },
+                destinationSize: 0, // nothing is expected to be written
+                replaceInvalidSequences: false,
+                isFinalChunk: false,
+                expectedOperationStatus: expectedOperationStatus,
+                expectedNumCharsRead: 0, // the lone surrogate is expected to stay unconsumed
+                expectedUtf8Transcoding: Span<byte>.Empty);
+        }
+
+        [Theory]
+        [InlineData("<LOW><HIGH>", 0, "")] // swapped surrogate pair characters
+        [InlineData("A<LOW><HIGH>", 1, "41")] // consume standalone ASCII char, then swapped surrogate pair characters
+        [InlineData("A<HIGH>B", 1, "41")] // consume standalone ASCII char, then standalone high surrogate char
+        [InlineData("A<LOW>B", 1, "41")] // consume standalone ASCII char, then standalone low surrogate char
+        [InlineData("AB<HIGH><HIGH>", 2, "4142")] // consume two ASCII chars, then standalone high surrogate char
+        [InlineData("AB<LOW><LOW>", 2, "4142")] // consume two ASCII chars, then standalone low surrogate char
+        public void ToBytes_WithInvalidSurrogates(string utf16Input, int expectedNumCharsConsumed, string expectedUtf8TranscodingHex)
+        {
+            // xUnit can't handle ill-formed strings in [InlineData], so we replace here.
+
+            utf16Input = utf16Input.Replace("<HIGH>", "\uD800", StringComparison.Ordinal).Replace("<LOW>", "\uDFFF", StringComparison.Ordinal);
+
+            // These test cases are for the "fast processing" code which is the main loop of TranscodeToUtf8,
+            // so inputs should be at least 2 chars.
+
+            Assert.True(utf16Input.Length >= 2);
+
+            ToBytes_Test_Core(
+                utf16Input: utf16Input,
+                destinationSize: expectedUtf8TranscodingHex.Length / 2, // two hex digits per byte: exactly enough for the valid prefix
+                replaceInvalidSequences: false,
+                isFinalChunk: false,
+                expectedOperationStatus: OperationStatus.InvalidData,
+                expectedNumCharsRead: expectedNumCharsConsumed,
+                expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex));
+
+            // Now try the tests again with a larger buffer.
+            // This ensures that running out of destination space wasn't the reason we failed.
+
+            ToBytes_Test_Core(
+                utf16Input: utf16Input,
+                destinationSize: expectedUtf8TranscodingHex.Length / 2 + 16, // 16 spare bytes beyond the valid prefix
+                replaceInvalidSequences: false,
+                isFinalChunk: false,
+                expectedOperationStatus: OperationStatus.InvalidData,
+                expectedNumCharsRead: expectedNumCharsConsumed,
+                expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex));
+        }
+
+        [Theory]
+        [InlineData("<LOW><HIGH>", REPLACEMENT_CHAR_UTF8)] // standalone low surr. and incomplete high surr.
+        [InlineData("<HIGH><HIGH>", REPLACEMENT_CHAR_UTF8)] // standalone high surr. and incomplete high surr.
+        [InlineData("<LOW><LOW>", REPLACEMENT_CHAR_UTF8 + REPLACEMENT_CHAR_UTF8)] // standalone low surr. and incomplete low surr.
+        [InlineData("A<LOW>B<LOW>C<HIGH>D", "41" + REPLACEMENT_CHAR_UTF8 + "42" + REPLACEMENT_CHAR_UTF8 + "43" + REPLACEMENT_CHAR_UTF8 + "44")] // standalone low, low, high surrounded by other data
+        public void ToBytes_WithReplacements(string utf16Input, string expectedUtf8TranscodingHex)
+        {
+            // xUnit can't handle ill-formed strings in [InlineData], so we replace here.
+
+            utf16Input = utf16Input.Replace("<HIGH>", "\uD800").Replace("<LOW>", "\uDFFF");
+
+            bool isFinalCharHighSurrogate = char.IsHighSurrogate(utf16Input.Last());
+
+            ToBytes_Test_Core(
+                utf16Input: utf16Input,
+                destinationSize: expectedUtf8TranscodingHex.Length / 2,
+                replaceInvalidSequences: true,
+                isFinalChunk: false,
+                expectedOperationStatus: (isFinalCharHighSurrogate) ? OperationStatus.NeedMoreData : OperationStatus.Done,
+                expectedNumCharsRead: (isFinalCharHighSurrogate) ? (utf16Input.Length - 1) : utf16Input.Length,
+                expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex));
+
+            if (isFinalCharHighSurrogate)
+            {
+                // Also test with isFinalChunk = true
+                ToBytes_Test_Core(
+                    utf16Input: utf16Input,
+                    destinationSize: expectedUtf8TranscodingHex.Length / 2 + Rune.ReplacementChar.Utf8SequenceLength /* for replacement char */,
+                    replaceInvalidSequences: true,
+                    isFinalChunk: true,
+                    expectedOperationStatus: OperationStatus.Done,
+                    expectedNumCharsRead: utf16Input.Length,
+                    expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex + REPLACEMENT_CHAR_UTF8));
+            }
+        }
+
+        [Theory]
+        [InlineData(E_ACUTE_UTF16 + "<LOW>", true, 1, OperationStatus.DestinationTooSmall, E_ACUTE_UTF8)] // not enough output buffer to hold U+FFFD
+        [InlineData(E_ACUTE_UTF16 + "<LOW>", true, 2, OperationStatus.Done, E_ACUTE_UTF8 + REPLACEMENT_CHAR_UTF8)] // replace standalone low surr. at end
+        [InlineData(E_ACUTE_UTF16 + "<HIGH>", true, 1, OperationStatus.DestinationTooSmall, E_ACUTE_UTF8)] // not enough output buffer to hold U+FFFD
+        [InlineData(E_ACUTE_UTF16 + "<HIGH>", true, 2, OperationStatus.Done, E_ACUTE_UTF8 + REPLACEMENT_CHAR_UTF8)] // replace standalone high surr. at end
+        [InlineData(E_ACUTE_UTF16 + "<HIGH>", false, 1, OperationStatus.NeedMoreData, E_ACUTE_UTF8)] // don't replace standalone high surr. at end
+        [InlineData(E_ACUTE_UTF16 + "<HIGH>" + X_UTF16, true, 2, OperationStatus.DestinationTooSmall, E_ACUTE_UTF8 + REPLACEMENT_CHAR_UTF8)] // not enough output buffer to hold 'X'
+        [InlineData(E_ACUTE_UTF16 + "<HIGH>" + X_UTF16, false, 2, OperationStatus.DestinationTooSmall, E_ACUTE_UTF8 + REPLACEMENT_CHAR_UTF8)] // not enough output buffer to hold 'X'
+        [InlineData(E_ACUTE_UTF16 + "<HIGH>" + X_UTF16, true, 3, OperationStatus.Done, E_ACUTE_UTF8 + REPLACEMENT_CHAR_UTF8 + X_UTF8)] // replacement followed by 'X'
+        [InlineData(E_ACUTE_UTF16 + "<HIGH>" + X_UTF16, false, 3, OperationStatus.Done, E_ACUTE_UTF8 + REPLACEMENT_CHAR_UTF8 + X_UTF8)] // replacement followed by 'X'
+        public void ToBytes_WithReplacements_AndCustomBufferSizes(string utf16Input, bool isFinalChunk, int expectedNumCharsConsumed, OperationStatus expectedOperationStatus, string expectedUtf8TranscodingHex)
+        {
+            // xUnit can't handle ill-formed strings in [InlineData], so we replace here.
+
+            utf16Input = utf16Input.Replace("<HIGH>", "\uD800").Replace("<LOW>", "\uDFFF");
+
+            ToBytes_Test_Core(
+                utf16Input: utf16Input,
+                destinationSize: expectedUtf8TranscodingHex.Length / 2,
+                replaceInvalidSequences: true,
+                isFinalChunk: isFinalChunk,
+                expectedOperationStatus: expectedOperationStatus,
+                expectedNumCharsRead: expectedNumCharsConsumed,
+                expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex));
+        }
+
+        [Fact]
+        public void ToBytes_AllPossibleScalarValues()
+        {
+            ToBytes_Test_Core(
+                utf16Input: s_allScalarsAsUtf16.Span,
+                destinationSize: s_allScalarsAsUtf8.Length,
+                replaceInvalidSequences: false,
+                isFinalChunk: false,
+                expectedOperationStatus: OperationStatus.Done,
+                expectedNumCharsRead: s_allScalarsAsUtf16.Length,
+                expectedUtf8Transcoding: s_allScalarsAsUtf8.Span);
+        }
+
+        private static void ToBytes_Test_Core(ReadOnlySpan<char> utf16Input, int destinationSize, bool replaceInvalidSequences, bool isFinalChunk, OperationStatus expectedOperationStatus, int expectedNumCharsRead, ReadOnlySpan<byte> expectedUtf8Transcoding)
+        {
+            // Arrange
+
+            using (BoundedMemory<char> boundedSource = BoundedMemory.AllocateFromExistingData(utf16Input))
+            using (BoundedMemory<byte> boundedDestination = BoundedMemory.Allocate<byte>(destinationSize))
+            {
+                boundedSource.MakeReadonly();
+
+                // Act
+
+                OperationStatus actualOperationStatus = TextEncodings.Utf16.ToUtf8(boundedSource.Span, boundedDestination.Span, out int actualNumCharsRead, out int actualNumBytesWritten, replaceInvalidSequences, isFinalChunk);
+
+                // Assert
+
+                Assert.Equal(expectedOperationStatus, actualOperationStatus);
+                Assert.Equal(expectedNumCharsRead, actualNumCharsRead);
+                Assert.Equal(expectedUtf8Transcoding.Length, actualNumBytesWritten);
+                Assert.Equal(expectedUtf8Transcoding.ToArray(), boundedDestination.Span.Slice(0, actualNumBytesWritten).ToArray());
+            }
+        }
+
+        [Theory]
+        [InlineData("80", 0, "")] // sequence cannot begin with continuation character
+        [InlineData("8182", 0, "")] // sequence cannot begin with continuation character
+        [InlineData("838485", 0, "")] // sequence cannot begin with continuation character
+        [InlineData(X_UTF8 + "80", 1, X_UTF16)] // sequence cannot begin with continuation character
+        [InlineData(X_UTF8 + "8182", 1, X_UTF16)] // sequence cannot begin with continuation character
+        [InlineData("C0", 0, "")] // [ C0 ] is always invalid
+        [InlineData("C080", 0, "")] // [ C0 ] is always invalid
+        [InlineData("C08081", 0, "")] // [ C0 ] is always invalid
+        [InlineData(X_UTF8 + "C1", 1, X_UTF16)] // [ C1 ] is always invalid
+        [InlineData(X_UTF8 + "C180", 1, X_UTF16)] // [ C1 ] is always invalid
+        [InlineData(X_UTF8 + "C27F", 1, X_UTF16)] // [ C2 ] is improperly terminated
+        [InlineData("E2827F", 0, "")] // [ E2 82 ] is improperly terminated
+        [InlineData("E09F80", 0, "")] // [ E0 9F ... ] is overlong
+        [InlineData("E0C080", 0, "")] // [ E0 ] is improperly terminated
+        [InlineData("ED7F80", 0, "")] // [ ED ] is improperly terminated
+        [InlineData("EDA080", 0, "")] // [ ED A0 ... ] is surrogate
+        public void ToChars_WithSmallInvalidBuffers(string utf8HexInput, int expectedNumBytesConsumed, string expectedUtf16Transcoding)
+        {
+            // These test cases are for the "slow processing" code path at the end of TranscodeToUtf16,
+            // so inputs should be less than 4 bytes.
+
+            Assert.InRange(utf8HexInput.Length, 0, 6);
+
+            ToChars_Test_Core(
+              utf8Input: DecodeHex(utf8HexInput),
+              destinationSize: expectedUtf16Transcoding.Length,
+              replaceInvalidSequences: false,
+              isFinalChunk: false,
+              expectedOperationStatus: OperationStatus.InvalidData,
+              expectedNumBytesRead: expectedNumBytesConsumed,
+              expectedUtf16Transcoding: expectedUtf16Transcoding);
+
+            // Now try the tests again with a larger buffer.
+            // This ensures that running out of destination space wasn't the reason we failed.
+
+            ToChars_Test_Core(
+              utf8Input: DecodeHex(utf8HexInput),
+              destinationSize: expectedUtf16Transcoding.Length + 16,
+              replaceInvalidSequences: false,
+              isFinalChunk: false,
+              expectedOperationStatus: OperationStatus.InvalidData,
+              expectedNumBytesRead: expectedNumBytesConsumed,
+              expectedUtf16Transcoding: expectedUtf16Transcoding);
+        }
+
+        [Theory]
+        [InlineData("C2", 0, "")] // [ C2 ] is an incomplete sequence
+        [InlineData("E282", 0, "")] // [ E2 82 ] is an incomplete sequence
+        [InlineData(X_UTF8 + "C2", 1, X_UTF16)] // [ C2 ] is an incomplete sequence
+        [InlineData(X_UTF8 + "E0", 1, X_UTF16)] // [ E0 ] is an incomplete sequence
+        [InlineData(X_UTF8 + "E0BF", 1, X_UTF16)] // [ E0 BF ] is an incomplete sequence
+        [InlineData(X_UTF8 + "F0", 1, X_UTF16)] // [ F0 ] is an incomplete sequence
+        [InlineData(X_UTF8 + "F0BF", 1, X_UTF16)] // [ F0 BF ] is an incomplete sequence
+        [InlineData(X_UTF8 + "F0BFA0", 1, X_UTF16)] // [ F0 BF A0 ] is an incomplete sequence
+        [InlineData(E_ACUTE_UTF8 + "C2", 2, E_ACUTE_UTF16)] // [ C2 ] is an incomplete sequence
+        [InlineData(E_ACUTE_UTF8 + "E0", 2, E_ACUTE_UTF16)] // [ E0 ] is an incomplete sequence
+        [InlineData(E_ACUTE_UTF8 + "F0", 2, E_ACUTE_UTF16)] // [ F0 ] is an incomplete sequence
+        [InlineData(E_ACUTE_UTF8 + "E0BF", 2, E_ACUTE_UTF16)] // [ E0 BF ] is an incomplete sequence
+        [InlineData(E_ACUTE_UTF8 + "F0BF", 2, E_ACUTE_UTF16)] // [ F0 BF ] is an incomplete sequence
+        [InlineData(EURO_SYMBOL_UTF8 + "C2", 3, EURO_SYMBOL_UTF16)] // [ C2 ] is an incomplete sequence
+        [InlineData(EURO_SYMBOL_UTF8 + "E0", 3, EURO_SYMBOL_UTF16)] // [ E0 ] is an incomplete sequence
+        [InlineData(EURO_SYMBOL_UTF8 + "F0", 3, EURO_SYMBOL_UTF16)] // [ F0 ] is an incomplete sequence
+        public void ToChars_WithVariousIncompleteBuffers(string utf8HexInput, int expectedNumBytesConsumed, string expectedUtf16Transcoding)
+        {
+            // These test cases end with a truncated multi-byte sequence and are aimed at the "slow processing" code path
+            // at the end of TranscodeToUtf16. Inputs are at most 4 bytes long (several cases are exactly 4 bytes).
+
+            ToChars_Test_Core(
+              utf8Input: DecodeHex(utf8HexInput),
+              destinationSize: expectedUtf16Transcoding.Length,
+              replaceInvalidSequences: false,
+              isFinalChunk: false,
+              expectedOperationStatus: OperationStatus.NeedMoreData,
+              expectedNumBytesRead: expectedNumBytesConsumed,
+              expectedUtf16Transcoding: expectedUtf16Transcoding);
+
+            // Now try the tests again with a larger buffer.
+            // This ensures that running out of destination space wasn't the reason we failed.
+
+            ToChars_Test_Core(
+             utf8Input: DecodeHex(utf8HexInput),
+             destinationSize: expectedUtf16Transcoding.Length + 16,
+             replaceInvalidSequences: false,
+             isFinalChunk: false,
+             expectedOperationStatus: OperationStatus.NeedMoreData,
+             expectedNumBytesRead: expectedNumBytesConsumed,
+             expectedUtf16Transcoding: expectedUtf16Transcoding);
+        }
+
+        [Theory]
+        /* SMALL VALID BUFFERS - tests drain loop at end of method */
+        [InlineData("")] // empty string is OK
+        [InlineData("X")]
+        [InlineData("XY")]
+        [InlineData("XYZ")]
+        [InlineData(E_ACUTE_UTF16)]
+        [InlineData(X_UTF16 + E_ACUTE_UTF16)]
+        [InlineData(E_ACUTE_UTF16 + X_UTF16)]
+        [InlineData(EURO_SYMBOL_UTF16)]
+        /* LARGE VALID BUFFERS - test main loop at beginning of method */
+        [InlineData(E_ACUTE_UTF16 + "ABCD" + "0123456789:;<=>?")] // Loop unrolling at end of buffer
+        [InlineData(E_ACUTE_UTF16 + "ABCD" + "0123456789:;<=>?" + "01234567" + E_ACUTE_UTF16 + "89:;<=>?")] // Loop unrolling interrupted by non-ASCII
+        [InlineData("ABC" + E_ACUTE_UTF16 + "0123")] // 3 ASCII bytes followed by non-ASCII
+        [InlineData("AB" + E_ACUTE_UTF16 + "0123")] // 2 ASCII bytes followed by non-ASCII
+        [InlineData("A" + E_ACUTE_UTF16 + "0123")] // 1 ASCII byte followed by non-ASCII
+        [InlineData(E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16)] // 4x 2-byte sequences, exercises optimization code path in 2-byte sequence processing
+        [InlineData(E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + "PQ")] // 3x 2-byte sequences + 2 ASCII bytes, exercises optimization code path in 2-byte sequence processing
+        [InlineData(E_ACUTE_UTF16 + "PQ")] // single 2-byte sequence + 2 trailing ASCII bytes, exercises draining logic in 2-byte sequence processing
+        [InlineData(E_ACUTE_UTF16 + "P" + E_ACUTE_UTF16 + "0@P")] // single 2-byte sequence + 1 trailing ASCII byte + 2-byte sequence, exercises draining logic in 2-byte sequence processing
+        [InlineData(EURO_SYMBOL_UTF16 + "@")] // single 3-byte sequence + 1 trailing ASCII byte, exercises draining logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL_UTF16 + "@P`")] // single 3-byte sequence + 3 trailing ASCII bytes, exercises draining logic and "running out of data" logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16)] // 3x 3-byte sequences, exercises "stay within 3-byte loop" logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16)] // 4x 3-byte sequences, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + E_ACUTE_UTF16)] // 3x 3-byte sequences + single 2-byte sequence, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16)] // 2x 3-byte sequences + 4x 2-byte sequences, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing
+        [InlineData(GRINNING_FACE_UTF16 + GRINNING_FACE_UTF16)] // 2x 4-byte sequences, exercises 4-byte sequence processing
+        [InlineData(GRINNING_FACE_UTF16 + "@AB")] // single 4-byte sequence + 3 ASCII bytes, exercises 4-byte sequence processing and draining logic
+        [InlineData(WOMAN_CARTWHEELING_MEDSKIN_UTF16)] // exercises switching between multiple sequence lengths
+        public void ToChars_ValidBuffers(string utf16Input)
+        {
+            // We're going to run the tests with destination buffer lengths ranging from 0 all the way
+            // to buffers large enough to hold the full output. This allows us to test logic that
+            // detects whether we're about to overrun our destination buffer and instead returns DestinationTooSmall.
+
+            Rune[] enumeratedScalars = utf16Input.EnumerateRunes().ToArray();
+
+            // Convert entire input to UTF-8 using our unit test reference logic.
+
+            byte[] utf8Input = enumeratedScalars.SelectMany(ToUtf8).ToArray();
+
+            // 0-length buffer test
+            ToChars_Test_Core(
+                utf8Input: utf8Input,
+                destinationSize: 0,
+                replaceInvalidSequences: false,
+                isFinalChunk: false,
+                expectedOperationStatus: (utf8Input.Length == 0) ? OperationStatus.Done : OperationStatus.DestinationTooSmall,
+                expectedNumBytesRead: 0,
+                expectedUtf16Transcoding: ReadOnlySpan<char>.Empty);
+
+            int expectedNumBytesConsumed = 0;
+            char[] concatenatedUtf16 = Array.Empty<char>();
+
+            for (int i = 0; i < enumeratedScalars.Length; i++)
+            {
+                Rune thisScalar = enumeratedScalars[i];
+
+                // if this is an astral scalar value, quickly test a buffer that's not large enough to contain the entire UTF-16 encoding
+
+                if (!thisScalar.IsBmp)
+                {
+                    ToChars_Test_Core(
+                        utf8Input: utf8Input,
+                        destinationSize: concatenatedUtf16.Length + 1,
+                        replaceInvalidSequences: false,
+                        isFinalChunk: false,
+                        expectedOperationStatus: OperationStatus.DestinationTooSmall,
+                        expectedNumBytesRead: expectedNumBytesConsumed,
+                        expectedUtf16Transcoding: concatenatedUtf16);
+                }
+
+                // now provide a destination buffer large enough to hold the next full scalar encoding
+
+                expectedNumBytesConsumed += thisScalar.Utf8SequenceLength;
+                concatenatedUtf16 = concatenatedUtf16.Concat(ToUtf16(thisScalar)).ToArray();
+
+                ToChars_Test_Core(
+                    utf8Input: utf8Input,
+                    destinationSize: concatenatedUtf16.Length,
+                    replaceInvalidSequences: false,
+                    isFinalChunk: false,
+                    expectedOperationStatus: (i == enumeratedScalars.Length - 1) ? OperationStatus.Done : OperationStatus.DestinationTooSmall,
+                    expectedNumBytesRead: expectedNumBytesConsumed,
+                    expectedUtf16Transcoding: concatenatedUtf16);
+            }
+
+            // now throw lots of ASCII data at the beginning so that we exercise the vectorized code paths
+
+            utf16Input = new string('x', 64) + utf16Input;
+            utf8Input = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray();
+
+            ToChars_Test_Core(
+                utf8Input: utf8Input,
+                destinationSize: utf16Input.Length,
+                replaceInvalidSequences: false,
+                isFinalChunk: true,
+                expectedOperationStatus: OperationStatus.Done,
+                expectedNumBytesRead: utf8Input.Length,
+                expectedUtf16Transcoding: utf16Input);
+
+            // now throw some non-ASCII data at the beginning so that we *don't* exercise the vectorized code paths
+
+            utf16Input = WOMAN_CARTWHEELING_MEDSKIN_UTF16 + utf16Input[64..];
+            utf8Input = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray();
+
+            ToChars_Test_Core(
+                utf8Input: utf8Input,
+                destinationSize: utf16Input.Length,
+                replaceInvalidSequences: false,
+                isFinalChunk: true,
+                expectedOperationStatus: OperationStatus.Done,
+                expectedNumBytesRead: utf8Input.Length,
+                expectedUtf16Transcoding: utf16Input);
+        }
+
+        [Theory]
+        [InlineData("3031" + "80" + "202122232425", 2, "01")] // Continuation character at start of sequence should match no bitmask
+        [InlineData("3031" + "C080" + "2021222324", 2, "01")] // Overlong 2-byte sequence at start of DWORD
+        [InlineData("3031" + "C180" + "2021222324", 2, "01")] // Overlong 2-byte sequence at start of DWORD
+        [InlineData("C280" + "C180", 2, "\u0080")] // Overlong 2-byte sequence at end of DWORD
+        [InlineData("C27F" + "C280", 0, "")] // Improperly terminated 2-byte sequence at start of DWORD
+        [InlineData("C2C0" + "C280", 0, "")] // Improperly terminated 2-byte sequence at start of DWORD
+        [InlineData("C280" + "C27F", 2, "\u0080")] // Improperly terminated 2-byte sequence at end of DWORD
+        [InlineData("C280" + "C2C0", 2, "\u0080")] // Improperly terminated 2-byte sequence at end of DWORD
+        [InlineData("C280" + "C280" + "80203040", 4, "\u0080\u0080")] // Continuation character at start of sequence, within "stay in 2-byte processing" optimization
+        [InlineData("C280" + "C280" + "C180" + "C280", 4, "\u0080\u0080")] // Overlong 2-byte sequence at start of DWORD, within "stay in 2-byte processing" optimization
+        [InlineData("C280" + "C280" + "C280" + "C180", 6, "\u0080\u0080\u0080")] // Overlong 2-byte sequence at end of DWORD, within "stay in 2-byte processing" optimization
+        [InlineData("3031" + "E09F80" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Overlong 3-byte sequence at start of DWORD
+        [InlineData("3031" + "E07F80" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Improperly terminated 3-byte sequence at start of DWORD
+        [InlineData("3031" + "E0C080" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Improperly terminated 3-byte sequence at start of DWORD
+        [InlineData("3031" + "E17F80" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Improperly terminated 3-byte sequence at start of DWORD
+        [InlineData("3031" + "E1C080" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Improperly terminated 3-byte sequence at start of DWORD
+        [InlineData("3031" + "EDA080" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Surrogate 3-byte sequence at start of DWORD
+        [InlineData("3031" + "E69C88" + "E59B" + "E69C88", 5, "01\u6708")] // Incomplete 3-byte sequence surrounded by valid 3-byte sequences
+        [InlineData("3031" + "F5808080", 2, "01")] // [ F5 ] is always invalid
+        [InlineData("3031" + "F6808080", 2, "01")] // [ F6 ] is always invalid
+        [InlineData("3031" + "F7808080", 2, "01")] // [ F7 ] is always invalid
+        [InlineData("3031" + "F8808080", 2, "01")] // [ F8 ] is always invalid
+        [InlineData("3031" + "F9808080", 2, "01")] // [ F9 ] is always invalid
+        [InlineData("3031" + "FA808080", 2, "01")] // [ FA ] is always invalid
+        [InlineData("3031" + "FB808080", 2, "01")] // [ FB ] is always invalid
+        [InlineData("3031" + "FC808080", 2, "01")] // [ FC ] is always invalid
+        [InlineData("3031" + "FD808080", 2, "01")] // [ FD ] is always invalid
+        [InlineData("3031" + "FE808080", 2, "01")] // [ FE ] is always invalid
+        [InlineData("3031" + "FF808080", 2, "01")] // [ FF ] is always invalid
+        public void ToChars_WithLargeInvalidBuffers(string utf8HexInput, int expectedNumBytesConsumed, string expectedUtf16Transcoding)
+        {
+            // These test cases are for the "fast processing" code which is the main loop of TranscodeToUtf16,
+            // so inputs should be >= 4 bytes.
+
+            Assert.True(utf8HexInput.Length >= 8);
+
+            ToChars_Test_Core(
+                utf8Input: DecodeHex(utf8HexInput),
+                destinationSize: expectedUtf16Transcoding.Length,
+                replaceInvalidSequences: false,
+                isFinalChunk: false,
+                expectedOperationStatus: OperationStatus.InvalidData,
+                expectedNumBytesRead: expectedNumBytesConsumed,
+                expectedUtf16Transcoding: expectedUtf16Transcoding);
+
+            // Now try the tests again with a larger buffer.
+            // This ensures that running out of destination space wasn't the reason we failed.
+
+            ToChars_Test_Core(
+                utf8Input: DecodeHex(utf8HexInput),
+                destinationSize: expectedUtf16Transcoding.Length + 16,
+                replaceInvalidSequences: false,
+                isFinalChunk: false,
+                expectedOperationStatus: OperationStatus.InvalidData,
+                expectedNumBytesRead: expectedNumBytesConsumed,
+                expectedUtf16Transcoding: expectedUtf16Transcoding);
+        }
+
+        [Theory]
+        [InlineData(X_UTF8 + "80" + X_UTF8, X_UTF16 + REPLACEMENT_CHAR_UTF16 + X_UTF16)] // stray continuation byte [ 80 ]
+        [InlineData(X_UTF8 + "FF" + X_UTF8, X_UTF16 + REPLACEMENT_CHAR_UTF16 + X_UTF16)] // invalid UTF-8 byte [ FF ]
+        [InlineData(X_UTF8 + "C2" + X_UTF8, X_UTF16 + REPLACEMENT_CHAR_UTF16 + X_UTF16)] // 2-byte sequence starter [ C2 ] not followed by continuation byte
+        [InlineData(X_UTF8 + "C1C180" + X_UTF8, X_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + X_UTF16)] // [ C1 80 ] is overlong but consists of two maximal invalid subsequences, each of length 1 byte
+        [InlineData(X_UTF8 + E_ACUTE_UTF8 + "E08080", X_UTF16 + E_ACUTE_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16)] // [ E0 80 ] is overlong 3-byte sequence (1 byte maximal invalid subsequence), and following [ 80 ] is stray continuation byte
+        [InlineData(GRINNING_FACE_UTF8 + "F08F8080" + GRINNING_FACE_UTF8, GRINNING_FACE_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + GRINNING_FACE_UTF16)] // [ F0 8F ] is overlong 4-byte sequence (1 byte maximal invalid subsequence), and following [ 80 ] instances are stray continuation bytes
+        [InlineData(GRINNING_FACE_UTF8 + "F4908080" + GRINNING_FACE_UTF8, GRINNING_FACE_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + GRINNING_FACE_UTF16)] // [ F4 90 ] is out-of-range 4-byte sequence (1 byte maximal invalid subsequence), and following [ 80 ] instances are stray continuation bytes
+        [InlineData(E_ACUTE_UTF8 + "EDA0" + X_UTF8, E_ACUTE_UTF16 + REPLACEMENT_CHAR_UTF16 + REPLACEMENT_CHAR_UTF16 + X_UTF16)] // [ ED A0 ] is encoding of UTF-16 surrogate code point, so consists of two maximal invalid subsequences, each of length 1 byte
+        [InlineData(E_ACUTE_UTF8 + "ED80" + X_UTF8, E_ACUTE_UTF16 + REPLACEMENT_CHAR_UTF16 + X_UTF16)] // [ ED 80 ] is incomplete 3-byte sequence, so is 2-byte maximal invalid subsequence
+        [InlineData(E_ACUTE_UTF8 + "F380" + X_UTF8, E_ACUTE_UTF16 + REPLACEMENT_CHAR_UTF16 + X_UTF16)] // [ F3 80 ] is incomplete 4-byte sequence, so is 2-byte maximal invalid subsequence
+        [InlineData(E_ACUTE_UTF8 + "F38080" + X_UTF8, E_ACUTE_UTF16 + REPLACEMENT_CHAR_UTF16 + X_UTF16)] // [ F3 80 80 ] is incomplete 4-byte sequence, so is 3-byte maximal invalid subsequence
+        public void ToChars_WithReplacement(string utf8HexInput, string expectedUtf16Transcoding)
+        {
+            // First run the test with isFinalChunk = false,
+            // both with and without some bytes of incomplete trailing data.
+
+            ToChars_Test_Core(
+                utf8Input: DecodeHex(utf8HexInput),
+                destinationSize: expectedUtf16Transcoding.Length,
+                replaceInvalidSequences: true,
+                isFinalChunk: false,
+                expectedOperationStatus: OperationStatus.Done,
+                expectedNumBytesRead: utf8HexInput.Length / 2,
+                expectedUtf16Transcoding: expectedUtf16Transcoding);
+
+            ToChars_Test_Core(
+                utf8Input: DecodeHex(utf8HexInput + "E0BF" /* trailing data */),
+                destinationSize: expectedUtf16Transcoding.Length,
+                replaceInvalidSequences: true,
+                isFinalChunk: false,
+                expectedOperationStatus: OperationStatus.NeedMoreData,
+                expectedNumBytesRead: utf8HexInput.Length / 2,
+                expectedUtf16Transcoding: expectedUtf16Transcoding);
+
+            // Then run the test with isFinalChunk = true, with incomplete trailing data.
+
+            ToChars_Test_Core(
+                utf8Input: DecodeHex(utf8HexInput + "E0BF" /* trailing data */),
+                destinationSize: expectedUtf16Transcoding.Length,
+                replaceInvalidSequences: true,
+                isFinalChunk: true,
+                expectedOperationStatus: OperationStatus.DestinationTooSmall,
+                expectedNumBytesRead: utf8HexInput.Length / 2,
+                expectedUtf16Transcoding: expectedUtf16Transcoding);
+
+            ToChars_Test_Core(
+                 utf8Input: DecodeHex(utf8HexInput + "E0BF" /* trailing data */),
+                 destinationSize: expectedUtf16Transcoding.Length + 1, // allow room for U+FFFD
+                 replaceInvalidSequences: true,
+                 isFinalChunk: true,
+                 expectedOperationStatus: OperationStatus.Done,
+                 expectedNumBytesRead: utf8HexInput.Length / 2 + 2,
+                 expectedUtf16Transcoding: expectedUtf16Transcoding + REPLACEMENT_CHAR_UTF16);
+        }
+
+        [Fact]
+        public void ToChars_AllPossibleScalarValues()
+        {
+            ToChars_Test_Core(
+                utf8Input: s_allScalarsAsUtf8.Span,
+                destinationSize: s_allScalarsAsUtf16.Length,
+                replaceInvalidSequences: false,
+                isFinalChunk: false,
+                expectedOperationStatus: OperationStatus.Done,
+                expectedNumBytesRead: s_allScalarsAsUtf8.Length,
+                expectedUtf16Transcoding: s_allScalarsAsUtf16.Span);
+        }
+
+        private static void ToChars_Test_Core(ReadOnlySpan<byte> utf8Input, int destinationSize, bool replaceInvalidSequences, bool isFinalChunk, OperationStatus expectedOperationStatus, int expectedNumBytesRead, ReadOnlySpan<char> expectedUtf16Transcoding)
+        {
+            // Arrange
+
+            using (BoundedMemory<byte> boundedSource = BoundedMemory.AllocateFromExistingData(utf8Input))
+            using (BoundedMemory<char> boundedDestination = BoundedMemory.Allocate<char>(destinationSize))
+            {
+                boundedSource.MakeReadonly();
+
+                // Act
+
+                OperationStatus actualOperationStatus = TextEncodings.Utf8.ToUtf16(boundedSource.Span, boundedDestination.Span, out int actualNumBytesRead, out int actualNumCharsWritten, replaceInvalidSequences, isFinalChunk);
+
+                // Assert
+
+                Assert.Equal(expectedOperationStatus, actualOperationStatus);
+                Assert.Equal(expectedNumBytesRead, actualNumBytesRead);
+                Assert.Equal(expectedUtf16Transcoding.Length, actualNumCharsWritten);
+                Assert.Equal(expectedUtf16Transcoding.ToString(), boundedDestination.Span.Slice(0, actualNumCharsWritten).ToString());
+            }
+        }
+    }
+}
+#endif
diff --git a/test/DotNetty.Common.Tests/Internal/CoreLib/Utf8UtilityTests.ValidateBytes.cs b/test/DotNetty.Common.Tests/Internal/CoreLib/Utf8UtilityTests.ValidateBytes.cs
new file mode 100644
index 000000000..3f85c6254
--- /dev/null
+++ b/test/DotNetty.Common.Tests/Internal/CoreLib/Utf8UtilityTests.ValidateBytes.cs
@@ -0,0 +1,396 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+#if CORELIBTEST
+using System;
+using System.Buffers;
+using System.Linq;
+using System.Runtime.InteropServices;
+using DotNetty.Common.Internal;
+using Xunit;
+
+namespace DotNetty.Common.Tests.Internal.CoreLib
+{
+    public class Utf8UtilityTests
+    {
+        private const string X = "58"; // U+0058 LATIN CAPITAL LETTER X, 1 byte
+        private const string Y = "59"; // U+0059 LATIN CAPITAL LETTER Y, 1 byte
+        private const string Z = "5A"; // U+005A LATIN CAPITAL LETTER Z, 1 byte
+        private const string E_ACUTE = "C3A9"; // U+00E9 LATIN SMALL LETTER E WITH ACUTE, 2 bytes
+        private const string EURO_SYMBOL = "E282AC"; // U+20AC EURO SIGN, 3 bytes
+        private const string GRINNING_FACE = "F09F9880"; // U+1F600 GRINNING FACE, 4 bytes
+
+        [Theory]
+        [InlineData("", 0, 0)] // empty string is OK
+        [InlineData(X, 1, 0)]
+        [InlineData(X + Y, 2, 0)]
+        [InlineData(X + Y + Z, 3, 0)]
+        [InlineData(E_ACUTE, 1, 0)]
+        [InlineData(X + E_ACUTE, 2, 0)]
+        [InlineData(E_ACUTE + X, 2, 0)]
+        [InlineData(EURO_SYMBOL, 1, 0)]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithSmallValidBuffers(string input, int expectedRuneCount, int expectedSurrogatePairCount)
+        {
+            // These test cases are for the "slow processing" code path at the end of GetIndexOfFirstInvalidUtf8Sequence,
+            // so inputs should be less than 4 bytes (the input is a hex string, hence at most 6 characters).
+
+            Assert.InRange(input.Length, 0, 6);
+
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(input, -1 /* expectedRetVal */, expectedRuneCount, expectedSurrogatePairCount);
+        }
+
+        [Theory]
+        [InlineData("80", 0, 0, 0)] // sequence cannot begin with continuation character
+        [InlineData("8182", 0, 0, 0)] // sequence cannot begin with continuation character
+        [InlineData("838485", 0, 0, 0)] // sequence cannot begin with continuation character
+        [InlineData(X + "80", 1, 1, 0)] // sequence cannot begin with continuation character
+        [InlineData(X + "8182", 1, 1, 0)] // sequence cannot begin with continuation character
+        [InlineData("C0", 0, 0, 0)] // [ C0 ] is always invalid
+        [InlineData("C080", 0, 0, 0)] // [ C0 ] is always invalid
+        [InlineData("C08081", 0, 0, 0)] // [ C0 ] is always invalid
+        [InlineData(X + "C1", 1, 1, 0)] // [ C1 ] is always invalid
+        [InlineData(X + "C180", 1, 1, 0)] // [ C1 ] is always invalid
+        [InlineData("C2", 0, 0, 0)] // [ C2 ] is improperly terminated
+        [InlineData(X + "C27F", 1, 1, 0)] // [ C2 ] is improperly terminated
+        [InlineData(X + "E282", 1, 1, 0)] // [ E2 82 ] is improperly terminated
+        [InlineData("E2827F", 0, 0, 0)] // [ E2 82 ] is improperly terminated
+        [InlineData("E09F80", 0, 0, 0)] // [ E0 9F ... ] is overlong
+        [InlineData("E0C080", 0, 0, 0)] // [ E0 ] is improperly terminated
+        [InlineData("ED7F80", 0, 0, 0)] // [ ED ] is improperly terminated
+        [InlineData("EDA080", 0, 0, 0)] // [ ED A0 ... ] is surrogate
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithSmallInvalidBuffers(string input, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount)
+        {
+            // These test cases are for the "slow processing" code path at the end of GetIndexOfFirstInvalidUtf8Sequence,
+            // so inputs should be less than 4 bytes (the input is a hex string, hence at most 6 characters).
+
+            Assert.InRange(input.Length, 0, 6);
+
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(input, expectedRetVal, expectedRuneCount, expectedSurrogatePairCount);
+        }
+
+        [Theory]
+        [InlineData(E_ACUTE + "21222324" + "303132333435363738393A3B3C3D3E3F", 21, 0)] // Loop unrolling at end of buffer
+        [InlineData(E_ACUTE + "21222324" + "303132333435363738393A3B3C3D3E3F" + "3031323334353637" + E_ACUTE + "38393A3B3C3D3E3F", 38, 0)] // Loop unrolling interrupted by non-ASCII
+        [InlineData("212223" + E_ACUTE + "30313233", 8, 0)] // 3 ASCII bytes followed by non-ASCII
+        [InlineData("2122" + E_ACUTE + "30313233", 7, 0)] // 2 ASCII bytes followed by non-ASCII
+        [InlineData("21" + E_ACUTE + "30313233", 6, 0)] // 1 ASCII byte followed by non-ASCII
+        [InlineData(E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE, 4, 0)] // 4x 2-byte sequences, exercises optimization code path in 2-byte sequence processing
+        [InlineData(E_ACUTE + E_ACUTE + E_ACUTE + "5051", 5, 0)] // 3x 2-byte sequences + 2 ASCII bytes, exercises optimization code path in 2-byte sequence processing
+        [InlineData(E_ACUTE + "5051", 3, 0)] // single 2-byte sequence + 2 trailing ASCII bytes, exercises draining logic in 2-byte sequence processing
+        [InlineData(E_ACUTE + "50" + E_ACUTE + "304050", 6, 0)] // single 2-byte sequences + 1 trailing ASCII byte + 2-byte sequence, exercises draining logic in 2-byte sequence processing
+        [InlineData(EURO_SYMBOL + "20", 2, 0)] // single 3-byte sequence + 1 trailing ASCII byte, exercises draining logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL + "203040", 4, 0)] // single 3-byte sequence + 3 trailing ASCII byte, exercises draining logic and "running out of data" logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL, 3, 0)] // 3x 3-byte sequences, exercises "stay within 3-byte loop" logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL, 4, 0)] // 4x 3-byte sequences, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + E_ACUTE, 4, 0)] // 3x 3-byte sequences + single 2-byte sequence, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL + EURO_SYMBOL + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE, 6, 0)] // 2x 3-byte sequences + 4x 2-byte sequences, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing
+        [InlineData(GRINNING_FACE + GRINNING_FACE, 2, 2)] // 2x 4-byte sequences, exercises 4-byte sequence processing
+        [InlineData(GRINNING_FACE + "303132", 4, 1)] // single 4-byte sequence + 3 ASCII bytes, exercises 4-byte sequence processing and draining logic
+        [InlineData("F09FA4B8" + "F09F8FBD" + "E2808D" + "E29980" + "EFB88F", 5, 2)] // U+1F938 U+1F3FD U+200D U+2640 U+FE0F WOMAN CARTWHEELING: MEDIUM SKIN TONE, exercising switching between multiple sequence lengths
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithLargeValidBuffers(string input, int expectedRuneCount, int expectedSurrogatePairCount)
+        {
+            // These test cases are for the "fast processing" code which is the main loop of GetIndexOfFirstInvalidUtf8Sequence,
+            // so inputs should be >= 4 bytes (the input is a hex string, hence at least 8 characters).
+
+            Assert.True(input.Length >= 8);
+
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(input, -1 /* expectedRetVal */, expectedRuneCount, expectedSurrogatePairCount);
+        }
+
+        [Theory]
+        [InlineData("3031" + "80" + "202122232425", 2, 2, 0)] // Continuation character at start of sequence should match no bitmask
+        [InlineData("3031" + "C080" + "2021222324", 2, 2, 0)] // Overlong 2-byte sequence at start of DWORD
+        [InlineData("3031" + "C180" + "2021222324", 2, 2, 0)] // Overlong 2-byte sequence at start of DWORD
+        [InlineData("C280" + "C180", 2, 1, 0)] // Overlong 2-byte sequence at end of DWORD
+        [InlineData("C27F" + "C280", 0, 0, 0)] // Improperly terminated 2-byte sequence at start of DWORD
+        [InlineData("C2C0" + "C280", 0, 0, 0)] // Improperly terminated 2-byte sequence at start of DWORD
+        [InlineData("C280" + "C27F", 2, 1, 0)] // Improperly terminated 2-byte sequence at end of DWORD
+        [InlineData("C280" + "C2C0", 2, 1, 0)] // Improperly terminated 2-byte sequence at end of DWORD
+        [InlineData("C280" + "C280" + "80203040", 4, 2, 0)] // Continuation character at start of sequence, within "stay in 2-byte processing" optimization
+        [InlineData("C280" + "C280" + "C180" + "C280", 4, 2, 0)] // Overlong 2-byte sequence at start of DWORD, within "stay in 2-byte processing" optimization
+        [InlineData("C280" + "C280" + "C280" + "C180", 6, 3, 0)] // Overlong 2-byte sequence at end of DWORD, within "stay in 2-byte processing" optimization
+        [InlineData("3031" + "E09F80" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Overlong 3-byte sequence at start of DWORD
+        [InlineData("3031" + "E07F80" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Improperly terminated 3-byte sequence at start of DWORD
+        [InlineData("3031" + "E0C080" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Improperly terminated 3-byte sequence at start of DWORD
+        [InlineData("3031" + "E17F80" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Improperly terminated 3-byte sequence at start of DWORD
+        [InlineData("3031" + "E1C080" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Improperly terminated 3-byte sequence at start of DWORD
+        [InlineData("3031" + "EDA080" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Surrogate 3-byte sequence at start of DWORD
+        [InlineData("3031" + "E69C88" + "E59B" + "E69C88", 5, 3, 0)] // Incomplete 3-byte sequence surrounded by valid 3-byte sequences
+        [InlineData("E78B80" + "80", 3, 1, 0)] // Valid 3-byte sequence followed by standalone continuation byte
+        [InlineData("3031" + "F5808080", 2, 2, 0)] // [ F5 ] is always invalid
+        [InlineData("3031" + "F6808080", 2, 2, 0)] // [ F6 ] is always invalid
+        [InlineData("3031" + "F7808080", 2, 2, 0)] // [ F7 ] is always invalid
+        [InlineData("3031" + "F8808080", 2, 2, 0)] // [ F8 ] is always invalid
+        [InlineData("3031" + "F9808080", 2, 2, 0)] // [ F9 ] is always invalid
+        [InlineData("3031" + "FA808080", 2, 2, 0)] // [ FA ] is always invalid
+        [InlineData("3031" + "FB808080", 2, 2, 0)] // [ FB ] is always invalid
+        [InlineData("3031" + "FC808080", 2, 2, 0)] // [ FC ] is always invalid
+        [InlineData("3031" + "FD808080", 2, 2, 0)] // [ FD ] is always invalid
+        [InlineData("3031" + "FE808080", 2, 2, 0)] // [ FE ] is always invalid
+        [InlineData("3031" + "FF808080", 2, 2, 0)] // [ FF ] is always invalid
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithLargeInvalidBuffers(string input, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount)
+        {
+            // These test cases are for the "fast processing" code which is the main loop of GetIndexOfFirstInvalidUtf8Sequence,
+            // so inputs should be >= 4 bytes (the input is a hex string, hence at least 8 characters).
+
+            Assert.True(input.Length >= 8);
+
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(input, expectedRetVal, expectedRuneCount, expectedSurrogatePairCount);
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithOverlongTwoByteSequences_ReturnsInvalid()
+        {
+            // [ C0 ] is never a valid byte, indicates overlong 2-byte sequence
+            // We'll test that [ C0 ] [ 00..FF ] is treated as invalid
+
+            for (int i = 0; i < 256; i++)
+            {
+                AssertIsInvalidTwoByteSequence(new byte[] { 0xC0, (byte)i });
+            }
+
+            // [ C1 ] is never a valid byte, indicates overlong 2-byte sequence
+            // We'll test that [ C1 ] [ 00..FF ] is treated as invalid
+
+            for (int i = 0; i < 256; i++)
+            {
+                AssertIsInvalidTwoByteSequence(new byte[] { 0xC1, (byte)i });
+            }
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithImproperlyTerminatedTwoByteSequences_ReturnsInvalid()
+        {
+            // Test [ C2..DF ] [ 00..7F ] and [ C2..DF ] [ C0..FF ]: a 2-byte lead followed by a non-continuation byte
+
+            for (int i = 0xC2; i <= 0xDF; i++) // inclusive upper bound so that the [ DF ] lead byte is exercised as well
+            {
+                for (int j = 0; j < 0x80; j++)
+                {
+                    AssertIsInvalidTwoByteSequence(new byte[] { (byte)i, (byte)j });
+                }
+                for (int j = 0xC0; j < 0x100; j++)
+                {
+                    AssertIsInvalidTwoByteSequence(new byte[] { (byte)i, (byte)j });
+                }
+            }
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithOverlongThreeByteSequences_ReturnsInvalid()
+        {
+            // [ E0 ] [ 80..9F ] [ 80..BF ] is overlong 3-byte sequence
+
+            for (int i = 0x00; i < 0xA0; i++)
+            {
+                AssertIsInvalidThreeByteSequence(new byte[] { 0xE0, (byte)i, 0x80 });
+            }
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithSurrogateThreeByteSequences_ReturnsInvalid()
+        {
+            // [ ED ] [ A0..BF ] [ 80..BF ] is surrogate 3-byte sequence
+
+            for (int i = 0xA0; i < 0x100; i++)
+            {
+                AssertIsInvalidThreeByteSequence(new byte[] { 0xED, (byte)i, 0x80 });
+            }
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithImproperlyTerminatedThreeByteSequence_ReturnsInvalid()
+        {
+            // [ E0..EF ] [ 80..BF ] [ !(80..BF) ] is improperly terminated 3-byte sequence
+
+            for (int i = 0xE0; i < 0xF0; i++)
+            {
+                for (int j = 0x00; j < 0x80; j++)
+                {
+                    // Use both '9F' and 'A0' to make sure at least one isn't caught by overlong / surrogate checks
+                    AssertIsInvalidThreeByteSequence(new byte[] { (byte)i, 0x9F, (byte)j });
+                    AssertIsInvalidThreeByteSequence(new byte[] { (byte)i, 0xA0, (byte)j });
+                }
+                for (int j = 0xC0; j < 0x100; j++)
+                {
+                    // Use both '9F' and 'A0' to make sure at least one isn't caught by overlong / surrogate checks
+                    AssertIsInvalidThreeByteSequence(new byte[] { (byte)i, 0x9F, (byte)j });
+                    AssertIsInvalidThreeByteSequence(new byte[] { (byte)i, 0xA0, (byte)j });
+                }
+            }
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithOverlongFourByteSequences_ReturnsInvalid()
+        {
+            // [ F0 ] [ 80..8F ] [ 80..BF ] [ 80..BF ] is overlong 4-byte sequence
+
+            for (int i = 0x00; i < 0x90; i++)
+            {
+                AssertIsInvalidFourByteSequence(new byte[] { 0xF0, (byte)i, 0x80, 0x80 });
+            }
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithOutOfRangeFourByteSequences_ReturnsInvalid()
+        {
+            // [ F4 ] [ 90..BF ] [ 80..BF ] [ 80..BF ] is out-of-range 4-byte sequence
+
+            for (int i = 0x90; i < 0x100; i++)
+            {
+                AssertIsInvalidFourByteSequence(new byte[] { 0xF4, (byte)i, 0x80, 0x80 });
+            }
+        }
+
+        [Fact]
+        public void GetIndexOfFirstInvalidUtf8Sequence_WithInvalidFourByteSequence_ReturnsInvalid()
+        {
+            // [ F0..F4 ] followed by three trailing bytes, one of which is !(80..BF), is an improperly terminated 4-byte sequence
+
+            for (int i = 0xF0; i < 0xF5; i++)
+            {
+                for (int j = 0x00; j < 0x80; j++)
+                {
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, (byte)j, 0x80, 0x80 });
+
+                    // Use both '8F' and '90' to make sure at least one isn't caught by overlong / out-of-range checks
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x8F, (byte)j, 0x80 });
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x90, (byte)j, 0x80 });
+
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x8F, 0x80, (byte)j });
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x90, 0x80, (byte)j });
+                }
+                for (int j = 0xC0; j < 0x100; j++)
+                {
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, (byte)j, 0x80, 0x80 });
+
+                    // Use both '8F' and '90' to make sure at least one isn't caught by overlong / out-of-range checks
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x8F, (byte)j, 0x80 });
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x90, (byte)j, 0x80 });
+
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x8F, 0x80, (byte)j });
+                    AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x90, 0x80, (byte)j });
+                }
+            }
+        }
+
+        private static void AssertIsInvalidTwoByteSequence(byte[] invalidSequence)
+        {
+            Assert.Equal(2, invalidSequence.Length);
+
+            byte[] knownGoodBytes = Utf8Tests.DecodeHex(E_ACUTE);
+
+            byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of first DWORD
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0);
+
+            toTest = knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of first DWORD
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 2, 1, 0);
+
+            // Run the same tests but with extra data at the beginning so that we're inside one of
+            // the 2-byte processing "hot loop" code paths.
+
+            toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of next DWORD
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 4, 2, 0);
+
+            toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of next DWORD
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 6, 3, 0);
+        }
+
+        private static void AssertIsInvalidThreeByteSequence(byte[] invalidSequence)
+        {
+            Assert.Equal(3, invalidSequence.Length);
+
+            byte[] knownGoodBytes = Utf8Tests.DecodeHex(EURO_SYMBOL);
+
+            byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at start of first DWORD
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0);
+
+            // Run the same tests but with extra data at the beginning so that we're inside one of
+            // the 3-byte processing "hot loop" code paths.
+
+            toTest = knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // straddling first and second DWORDs
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 3, 1, 0);
+
+            toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // straddling second and third DWORDs
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 6, 2, 0);
+
+            toTest = knownGoodBytes.Concat(knownGoodBytes).Concat(knownGoodBytes).Concat(invalidSequence).Concat(knownGoodBytes).ToArray(); // at end of third DWORD
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 9, 3, 0);
+        }
+
+        private static void AssertIsInvalidFourByteSequence(byte[] invalidSequence)
+        {
+            Assert.Equal(4, invalidSequence.Length);
+
+            byte[] knownGoodBytes = Utf8Tests.DecodeHex(GRINNING_FACE);
+
+            byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(knownGoodBytes).ToArray();
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0);
+
+            toTest = knownGoodBytes.Concat(invalidSequence).Concat(knownGoodBytes).ToArray();
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 4, 1, 1);
+        }
+
+        private static void GetIndexOfFirstInvalidUtf8Sequence_Test_Core(string inputHex, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount)
+        {
+            byte[] inputBytes = Utf8Tests.DecodeHex(inputHex);
+
+            // Run the test normally
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(inputBytes, expectedRetVal, expectedRuneCount, expectedSurrogatePairCount);
+
+            // Then run the test with a bunch of ASCII data at the beginning (to exercise the vectorized code paths)
+            inputBytes = Enumerable.Repeat((byte)'x', 128).Concat(inputBytes).ToArray();
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(inputBytes, (expectedRetVal < 0) ? expectedRetVal : (expectedRetVal + 128), expectedRuneCount + 128, expectedSurrogatePairCount);
+
+            // Then put a few more ASCII bytes at the beginning (to test that offsets are properly handled)
+            inputBytes = Enumerable.Repeat((byte)'x', 7).Concat(inputBytes).ToArray();
+            GetIndexOfFirstInvalidUtf8Sequence_Test_Core(inputBytes, (expectedRetVal < 0) ? expectedRetVal : (expectedRetVal + 135), expectedRuneCount + 135, expectedSurrogatePairCount);
+        }
+
+        private static unsafe void GetIndexOfFirstInvalidUtf8Sequence_Test_Core(byte[] input, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount)
+        {
+            // Arrange
+
+            using BoundedMemory<byte> boundedMemory = BoundedMemory.AllocateFromExistingData(input);
+            boundedMemory.MakeReadonly();
+
+            // Act
+
+            int actualRetVal;
+            int actualSurrogatePairCount;
+            int actualRuneCount;
+
+            fixed (byte* pInputBuffer = &MemoryMarshal.GetReference(boundedMemory.Span))
+            {
+                byte* pFirstInvalidByte = Utf8Utility.GetPointerToFirstInvalidByte(pInputBuffer, input.Length, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment);
+
+                long ptrDiff = pFirstInvalidByte - pInputBuffer;
+                Assert.True((ulong)ptrDiff <= (uint)input.Length, "ptrDiff was outside expected range.");
+
+                Assert.True(utf16CodeUnitCountAdjustment <= 0, "UTF-16 code unit count adjustment must be 0 or negative.");
+                Assert.True(scalarCountAdjustment <= 0, "Scalar count adjustment must be 0 or negative.");
+
+                actualRetVal = (ptrDiff == input.Length) ? -1 : (int)ptrDiff;
+
+                // The last two 'out' parameters are:
+                // a) The number to be added to the "bytes processed" return value to come up with the total UTF-16 code unit count, and
+                // b) The number to be added to the "total UTF-16 code unit count" value to come up with the total scalar count.
+
+                int totalUtf16CodeUnitCount = (int)ptrDiff + utf16CodeUnitCountAdjustment;
+                actualRuneCount = totalUtf16CodeUnitCount + scalarCountAdjustment;
+
+                // Surrogate pair count is number of UTF-16 code units less the number of scalars.
+
+                actualSurrogatePairCount = totalUtf16CodeUnitCount - actualRuneCount;
+            }
+
+            // Assert
+
+            Assert.Equal(expectedRetVal, actualRetVal);
+            Assert.Equal(expectedRuneCount, actualRuneCount);
+            Assert.Equal(expectedSurrogatePairCount, actualSurrogatePairCount);
+        }
+    }
+}
+#endif

From ed6cec8513c78d6a70ed547d2f22303f2afcb41d Mon Sep 17 00:00:00 2001
From: cuteant <cuteant@outlook.com>
Date: Thu, 24 Jun 2021 21:41:09 +0800
Subject: [PATCH 3/5] Update ByteBufferReader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* CoreLib(NET5.0-SDK)的部分API没有开放，SpanHelper还是只能使用.NET Core 3.1版本。

* Common/Buffer：NET5.0框架下，移除对SpanHelper的依赖。
---
 .../AbstractByteBuffer.NetStandard.cs         |  42 ++-
 .../ByteBufferUtil.Comparable.cs              |   8 +-
 .../ByteBufferUtil.Equatable.cs               |   8 +-
 .../Reader/ByteBufferReader.Search.cs         |  96 +++++
 .../Reader/ByteBufferReader.cs                |  61 +++-
 .../ByteBufferReaderExtensions.Binary.cs      |   4 +-
 .../AppendableCharSequence.NetStandard.cs     |  16 +-
 .../Internal/PlatformDependent.cs             |   4 +
 .../Internal/Utf8Utility.Validation.cs        |   2 +-
 .../Utilities/AsciiString.NetStandard.cs      |  62 ++++
 src/DotNetty.Common/Utilities/AsciiString.cs  |   4 +
 src/DotNetty.Common/Utilities/CharUtil.cs     |  22 ++
 .../Utilities/ICharSequenceExtensions.cs      |  11 +
 src/DotNetty.Common/Utilities/StringUtil.cs   |   8 +-
 ...DotNetty.Buffers.ReaderWriter.Tests.csproj |   2 +-
 .../test_corefx/BasicTests.cs                 | 341 +++++++++++++++++-
 .../test_corefx/ReadTo.cs                     |  44 ++-
 .../test_corefxlab/ReaderBasicTests.cs        |   2 +-
 18 files changed, 716 insertions(+), 21 deletions(-)

diff --git a/src/DotNetty.Buffers/AbstractByteBuffer.NetStandard.cs b/src/DotNetty.Buffers/AbstractByteBuffer.NetStandard.cs
index 3fd3c4d63..17cf7e304 100644
--- a/src/DotNetty.Buffers/AbstractByteBuffer.NetStandard.cs
+++ b/src/DotNetty.Buffers/AbstractByteBuffer.NetStandard.cs
@@ -206,7 +206,7 @@ public virtual int ReadBytes(Span<byte> destination)
             var readableBytes = Math.Min(_writerIndex - readerIndex, destination.Length);
             if (readableBytes > 0)
             {
-                _GetBytes(readerIndex, destination, readableBytes); 
+                _GetBytes(readerIndex, destination, readableBytes);
                 _readerIndex = readerIndex + readableBytes;
             }
             return readableBytes;
@@ -398,14 +398,22 @@ public virtual int IndexOf(int fromIndex, int toIndex, byte value)
         internal protected virtual int IndexOf0(int index, int count, byte value)
         {
             var span = GetReadableSpan(index, count);
+#if NET
+            var result = span.IndexOf(value);
+#else
             var result = SpanHelpers.IndexOf(ref MemoryMarshal.GetReference(span), value, span.Length);
+#endif
             return (uint)result < SharedConstants.uIndexNotFound ? index + result : result;
         }
 
         internal protected virtual int LastIndexOf0(int index, int count, byte value)
         {
             var span = GetReadableSpan(index, count);
+#if NET
+            var result = span.LastIndexOf(value);
+#else
             var result = SpanHelpers.LastIndexOf(ref MemoryMarshal.GetReference(span), value, span.Length);
+#endif
             return (uint)result < SharedConstants.uIndexNotFound ? index + result : result;
         }
 
@@ -431,14 +439,22 @@ public virtual int IndexOf(int fromIndex, int toIndex, in ReadOnlySpan<byte> val
         internal protected virtual int IndexOf0(int index, int count, in ReadOnlySpan<byte> values)
         {
             var span = GetReadableSpan(index, count);
+#if NET
+            var result = span.IndexOf(values);
+#else
             var result = SpanHelpers.IndexOf(ref MemoryMarshal.GetReference(span), span.Length, ref MemoryMarshal.GetReference(values), values.Length);
+#endif
             return (uint)result < SharedConstants.uIndexNotFound ? index + result : result;
         }
 
         internal protected virtual int LastIndexOf0(int index, int count, in ReadOnlySpan<byte> values)
         {
             var span = GetReadableSpan(index, count);
+#if NET
+            var result = span.LastIndexOf(values);
+#else
             var result = SpanHelpers.LastIndexOf(ref MemoryMarshal.GetReference(span), span.Length, ref MemoryMarshal.GetReference(values), values.Length);
+#endif
             return (uint)result < SharedConstants.uIndexNotFound ? index + result : result;
         }
 
@@ -464,14 +480,22 @@ public virtual int IndexOfAny(int fromIndex, int toIndex, byte value0, byte valu
         internal protected virtual int IndexOfAny0(int index, int count, byte value0, byte value1)
         {
             var span = GetReadableSpan(index, count);
+#if NET
+            var result = span.IndexOfAny(value0, value1);
+#else
             var result = SpanHelpers.IndexOfAny(ref MemoryMarshal.GetReference(span), value0, value1, span.Length);
+#endif
             return (uint)result < SharedConstants.uIndexNotFound ? index + result : result;
         }
 
         internal protected virtual int LastIndexOfAny0(int index, int count, byte value0, byte value1)
         {
             var span = GetReadableSpan(index, count);
+#if NET
+            var result = span.LastIndexOfAny(value0, value1);
+#else
             var result = SpanHelpers.LastIndexOfAny(ref MemoryMarshal.GetReference(span), value0, value1, span.Length);
+#endif
             return (uint)result < SharedConstants.uIndexNotFound ? index + result : result;
         }
 
@@ -497,14 +521,22 @@ public virtual int IndexOfAny(int fromIndex, int toIndex, byte value0, byte valu
         internal protected virtual int IndexOfAny0(int index, int count, byte value0, byte value1, byte value2)
         {
             var span = GetReadableSpan(index, count);
+#if NET
+            var result = span.IndexOfAny(value0, value1, value2);
+#else
             var result = SpanHelpers.IndexOfAny(ref MemoryMarshal.GetReference(span), value0, value1, value2, span.Length);
+#endif
             return (uint)result < SharedConstants.uIndexNotFound ? index + result : result;
         }
 
         internal protected virtual int LastIndexOfAny0(int index, int count, byte value0, byte value1, byte value2)
         {
             var span = GetReadableSpan(index, count);
+#if NET
+            var result = span.LastIndexOfAny(value0, value1, value2);
+#else
             var result = SpanHelpers.LastIndexOfAny(ref MemoryMarshal.GetReference(span), value0, value1, value2, span.Length);
+#endif
             return (uint)result < SharedConstants.uIndexNotFound ? index + result : result;
         }
 
@@ -530,14 +562,22 @@ public virtual int IndexOfAny(int fromIndex, int toIndex, in ReadOnlySpan<byte>
         internal protected virtual int IndexOfAny0(int index, int count, in ReadOnlySpan<byte> values)
         {
             var span = GetReadableSpan(index, count);
+#if NET
+            var result = span.IndexOfAny(values);
+#else
             var result = SpanHelpers.IndexOfAny(ref MemoryMarshal.GetReference(span), span.Length, ref MemoryMarshal.GetReference(values), values.Length);
+#endif
             return (uint)result < SharedConstants.uIndexNotFound ? index + result : result;
         }
 
         internal protected virtual int LastIndexOfAny0(int index, int count, in ReadOnlySpan<byte> values)
         {
             var span = GetReadableSpan(index, count);
+#if NET
+            var result = span.LastIndexOfAny(values);
+#else
             var result = SpanHelpers.LastIndexOfAny(ref MemoryMarshal.GetReference(span), span.Length, ref MemoryMarshal.GetReference(values), values.Length);
+#endif
             return (uint)result < SharedConstants.uIndexNotFound ? index + result : result;
         }
     }
diff --git a/src/DotNetty.Buffers/ByteBufferUtil.Comparable.cs b/src/DotNetty.Buffers/ByteBufferUtil.Comparable.cs
index eaa292a6e..01b916552 100644
--- a/src/DotNetty.Buffers/ByteBufferUtil.Comparable.cs
+++ b/src/DotNetty.Buffers/ByteBufferUtil.Comparable.cs
@@ -27,9 +27,11 @@ namespace DotNetty.Buffers
 {
     using System;
     using System.Runtime.CompilerServices;
+    using DotNetty.Common.Utilities;
+#if !NET
     using System.Runtime.InteropServices;
     using DotNetty.Common.Internal;
-    using DotNetty.Common.Utilities;
+#endif
 
     partial class ByteBufferUtil
     {
@@ -44,7 +46,11 @@ public static int Compare(IByteBuffer bufferA, IByteBuffer bufferB)
             {
                 var spanA = bufferA.GetReadableSpan();
                 var spanB = bufferB.GetReadableSpan();
+#if NET
+                return spanA.SequenceCompareTo(spanB);
+#else
                 return SpanHelpers.SequenceCompareTo(ref MemoryMarshal.GetReference(spanA), spanA.Length, ref MemoryMarshal.GetReference(spanB), spanB.Length);
+#endif
             }
             return CompareSlow(bufferA, bufferB);
         }
diff --git a/src/DotNetty.Buffers/ByteBufferUtil.Equatable.cs b/src/DotNetty.Buffers/ByteBufferUtil.Equatable.cs
index 53b1ce454..04376882d 100644
--- a/src/DotNetty.Buffers/ByteBufferUtil.Equatable.cs
+++ b/src/DotNetty.Buffers/ByteBufferUtil.Equatable.cs
@@ -26,9 +26,11 @@
 namespace DotNetty.Buffers
 {
     using System;
+    using DotNetty.Common.Utilities;
+#if !NET
     using System.Runtime.InteropServices;
     using DotNetty.Common.Internal;
-    using DotNetty.Common.Utilities;
+#endif
 
     partial class ByteBufferUtil
     {
@@ -55,7 +57,11 @@ public static bool Equals(IByteBuffer a, int aStartIndex, IByteBuffer b, int bSt
             {
                 var spanA = a.GetReadableSpan(aStartIndex, length);
                 var spanB = b.GetReadableSpan(bStartIndex, length);
+#if NET
+                return spanA.SequenceEqual(spanB);
+#else
                 return SpanHelpers.SequenceEqual(ref MemoryMarshal.GetReference(spanA), ref MemoryMarshal.GetReference(spanB), length);
+#endif
             }
             return EqualsSlow(a, aStartIndex, b, bStartIndex, length);
         }
diff --git a/src/DotNetty.Buffers/Reader/ByteBufferReader.Search.cs b/src/DotNetty.Buffers/Reader/ByteBufferReader.Search.cs
index 4f03c4be2..49b61808b 100644
--- a/src/DotNetty.Buffers/Reader/ByteBufferReader.Search.cs
+++ b/src/DotNetty.Buffers/Reader/ByteBufferReader.Search.cs
@@ -45,7 +45,11 @@ partial struct ByteBufferReader
         public bool TryReadTo(out ReadOnlySpan<byte> span, byte delimiter, bool advancePastDelimiter = true)
         {
             ReadOnlySpan<byte> remaining = UnreadSpan;
+#if NET
+            int index = remaining.IndexOf(delimiter);
+#else
             int index = SpanHelpers.IndexOf(ref MemoryMarshal.GetReference(remaining), delimiter, remaining.Length);
+#endif
 
             uint uIndex = (uint)index;
             if (SharedConstants.TooBigOrNegative >= uIndex) // index != -1
@@ -81,7 +85,11 @@ private bool TryReadToSlow(out ReadOnlySpan<byte> span, byte delimiter, bool adv
         public bool TryReadTo(out ReadOnlySpan<byte> span, byte delimiter, byte delimiterEscape, bool advancePastDelimiter = true)
         {
             ReadOnlySpan<byte> remaining = UnreadSpan;
+#if NET
+            int index = remaining.IndexOf(delimiter);
+#else
             int index = SpanHelpers.IndexOf(ref MemoryMarshal.GetReference(remaining), delimiter, remaining.Length);
+#endif
 
             if ((index > 0 && remaining[index - 1] != delimiterEscape) || 0u >= (uint)index)
             {
@@ -199,7 +207,11 @@ ref MemoryMarshal.GetReference(remaining),
                 remaining = _currentSpan;
 
             Continue:
+#if NET
+                index = remaining.IndexOf(delimiter);
+#else
                 index = SpanHelpers.IndexOf(ref MemoryMarshal.GetReference(remaining), delimiter, remaining.Length);
+#endif
             } while (!End);
 
             // Didn't find anything, reset our original state.
@@ -227,7 +239,11 @@ private bool TryReadToInternal(out ReadOnlySequence<byte> sequence, byte delimit
 
             while (_moreData)
             {
+#if NET
+                int index = remaining.IndexOf(delimiter);
+#else
                 int index = SpanHelpers.IndexOf(ref MemoryMarshal.GetReference(remaining), delimiter, remaining.Length);
+#endif
                 uint uIndex = (uint)index;
                 if (SharedConstants.TooBigOrNegative >= uIndex) // index != -1
                 {
@@ -271,7 +287,11 @@ public bool TryReadTo(out ReadOnlySequence<byte> sequence, byte delimiter, byte
 
             while (_moreData)
             {
+#if NET
+                int index = remaining.IndexOf(delimiter);
+#else
                 int index = SpanHelpers.IndexOf(ref MemoryMarshal.GetReference(remaining), delimiter, remaining.Length);
+#endif
                 uint uIndex = (uint)index;
                 if (SharedConstants.TooBigOrNegative >= uIndex) // index != -1
                 {
@@ -354,7 +374,13 @@ ref MemoryMarshal.GetReference(remaining),
         public bool TryReadToAny(out ReadOnlySpan<byte> span, in ReadOnlySpan<byte> delimiters, bool advancePastDelimiter = true)
         {
             ReadOnlySpan<byte> remaining = UnreadSpan;
+#if NET
+            int index = delimiters.Length == 2
+                ? remaining.IndexOfAny(delimiters[0], delimiters[1])
+                : remaining.IndexOfAny(delimiters);
+#else
             var index = SpanHelpers.IndexOfAny(ref MemoryMarshal.GetReference(remaining), remaining.Length, ref MemoryMarshal.GetReference(delimiters), delimiters.Length);
+#endif
 
             if (SharedConstants.TooBigOrNegative >= (uint)index) // index != -1
             {
@@ -399,7 +425,13 @@ private bool TryReadToAnyInternal(out ReadOnlySequence<byte> sequence, in ReadOn
 
             while (!End)
             {
+#if NET
+                int index = delimiters.Length == 2
+                    ? remaining.IndexOfAny(delimiters[0], delimiters[1])
+                    : remaining.IndexOfAny(delimiters);
+#else
                 int index = SpanHelpers.IndexOfAny(ref MemoryMarshal.GetReference(remaining), remaining.Length, ref delimiterSpace, delimiters.Length);
+#endif
                 uint uIndex = (uint)index;
                 if (SharedConstants.TooBigOrNegative >= uIndex) // index != -1
                 {
@@ -421,6 +453,46 @@ private bool TryReadToAnyInternal(out ReadOnlySequence<byte> sequence, in ReadOn
             return false;
         }
 
+        /// <summary>
+        /// Try to read everything up to the given <paramref name="delimiter"/>.
+        /// </summary>
+        /// <param name="span">The read data, if any.</param>
+        /// <param name="delimiter">The delimiter to look for.</param>
+        /// <param name="advancePastDelimiter">True to move past the <paramref name="delimiter"/> if found.</param>
+        /// <returns>True if the <paramref name="delimiter"/> was found.</returns>
+        public bool TryReadTo(out ReadOnlySpan<byte> span, ReadOnlySpan<byte> delimiter, bool advancePastDelimiter = true)
+        {
+            ReadOnlySpan<byte> remaining = UnreadSpan;
+#if NET
+            int index = remaining.IndexOf(delimiter);
+#else
+            int index = SpanHelpers.IndexOf(ref MemoryMarshal.GetReference(remaining), remaining.Length, ref MemoryMarshal.GetReference(delimiter), delimiter.Length);
+#endif
+
+            if (index >= 0)
+            {
+                span = remaining.Slice(0, index);
+                AdvanceCurrentSpan(index + (advancePastDelimiter ? delimiter.Length : 0));
+                return true;
+            }
+
+            // This delimiter might be skipped, go down the slow path
+            return TryReadToSlow(out span, delimiter, advancePastDelimiter);
+        }
+
+        private bool TryReadToSlow(out ReadOnlySpan<byte> span, ReadOnlySpan<byte> delimiter, bool advancePastDelimiter)
+        {
+            if (!TryReadTo(out ReadOnlySequence<byte> sequence, delimiter, advancePastDelimiter))
+            {
+                span = default;
+                return false;
+            }
+
+            Debug.Assert(sequence.Length > 0);
+            span = sequence.IsSingleSegment ? sequence.First.Span : sequence.ToArray();
+            return true;
+        }
+
         /// <summary>Try to read data until the entire given <paramref name="delimiter"/> matches.</summary>
         /// <param name="sequence">The read data, if any.</param>
         /// <param name="delimiter">The multi (byte) delimiter.</param>
@@ -487,7 +559,11 @@ public bool TryReadTo(out ReadOnlySequence<byte> sequence, in ReadOnlySpan<byte>
         public bool TryAdvanceTo(byte delimiter, bool advancePastDelimiter = true)
         {
             ReadOnlySpan<byte> remaining = UnreadSpan;
+#if NET
+            int index = remaining.IndexOf(delimiter);
+#else
             int index = SpanHelpers.IndexOf(ref MemoryMarshal.GetReference(remaining), delimiter, remaining.Length);
+#endif
             if (SharedConstants.TooBigOrNegative >= (uint)index) // ndex != -1
             {
                 Advance(advancePastDelimiter ? index + 1 : index);
@@ -504,7 +580,11 @@ public bool TryAdvanceTo(byte delimiter, bool advancePastDelimiter = true)
         public bool TryAdvanceToAny(in ReadOnlySpan<byte> delimiters, bool advancePastDelimiter = true)
         {
             ReadOnlySpan<byte> remaining = UnreadSpan;
+#if NET
+            int index = remaining.IndexOfAny(delimiters);
+#else
             int index = SpanHelpers.IndexOfAny(ref MemoryMarshal.GetReference(remaining), remaining.Length, ref MemoryMarshal.GetReference(delimiters), delimiters.Length);
+#endif
             if (SharedConstants.TooBigOrNegative >= (uint)index) // ndex != -1
             {
                 AdvanceCurrentSpan(index + (advancePastDelimiter ? 1 : 0));
@@ -669,6 +749,22 @@ ref MemoryMarshal.GetReference(searchSpan),
             return _consumed - start;
         }
 
+        /// <summary>
+        /// Moves the reader to the end of the sequence; does nothing when no data remains.
+        /// </summary>
+        public void AdvanceToEnd()
+        {
+            if (_moreData)
+            {
+                _consumed = Length; // write the backing fields directly, like the rest of this reader
+                _currentSpan = default;
+                _currentSpanIndex = 0;
+                _currentPosition = Sequence.End;
+                _nextPosition = default;
+                _moreData = false;
+            }
+        }
+
         /// <summary>Check to see if the given <paramref name="next"/> value is next.</summary>
         /// <param name="next">The value to compare the next items to.</param>
         /// <param name="advancePast">Move past the <paramref name="next"/> value if found.</param>
diff --git a/src/DotNetty.Buffers/Reader/ByteBufferReader.cs b/src/DotNetty.Buffers/Reader/ByteBufferReader.cs
index 685bfa19f..4fc7caa7b 100644
--- a/src/DotNetty.Buffers/Reader/ByteBufferReader.cs
+++ b/src/DotNetty.Buffers/Reader/ByteBufferReader.cs
@@ -40,7 +40,7 @@ public ref partial struct ByteBufferReader
         private SequencePosition _currentPosition;
         private SequencePosition _nextPosition;
         private bool _moreData;
-        private long _length;
+        private readonly long _length;
 
         private readonly ReadOnlySequence<byte> _sequence;
         private ReadOnlySpan<byte> _currentSpan;
@@ -107,6 +107,10 @@ public readonly bool End
         /// <summary>The underlying <see cref="ReadOnlySequence{T}"/> for the reader.</summary>
         public readonly ReadOnlySequence<byte> Sequence => _sequence;
 
+        /// <summary>Gets the unread portion of the <see cref="Sequence"/>.</summary>
+        /// <value>The unread portion of the <see cref="Sequence"/>.</value>
+        public readonly ReadOnlySequence<byte> UnreadSequence => Sequence.Slice(Position);
+
         /// <summary>The current position in the <see cref="Sequence"/>.</summary>
         public readonly SequencePosition Position
             => _sequence.GetPosition(_currentSpanIndex, _currentPosition);
@@ -172,6 +176,61 @@ public readonly bool TryPeek(out byte value)
             return false;
         }
 
+        /// <summary>Peeks at the next value at specific offset without advancing the reader.</summary>
+        /// <param name="offset">The offset from current position.</param>
+        /// <param name="value">The next value, or the default value if at the end of the reader.</param>
+        /// <returns><c>true</c> if the reader is not at its end and the peek operation succeeded; <c>false</c> if at the end of the reader.</returns>
+        public readonly bool TryPeek(long offset, out byte value)
+        {
+            if (offset < 0L) { ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.offset); }
+
+            // If we've got data and offset is not out of bounds
+            if (!_moreData || Remaining <= offset)
+            {
+                value = default;
+                return false;
+            }
+
+            // Sum CurrentSpanIndex + offset could overflow as is but the value of offset should be very large
+            // because we check Remaining <= offset above so to overflow we should have a ReadOnlySequence close to 8 exabytes
+            Debug.Assert(CurrentSpanIndex + offset >= 0);
+
+            // If offset doesn't fall inside current segment move to next until we find correct one
+            if ((CurrentSpanIndex + offset) <= CurrentSpan.Length - 1)
+            {
+                Debug.Assert(offset <= int.MaxValue);
+
+                value = CurrentSpan[CurrentSpanIndex + (int)offset];
+                return true;
+            }
+            else
+            {
+                long remainingOffset = offset - (CurrentSpan.Length - CurrentSpanIndex);
+                SequencePosition nextPosition = _nextPosition;
+                ReadOnlyMemory<byte> currentMemory;
+
+                while (Sequence.TryGet(ref nextPosition, out currentMemory, advance: true))
+                {
+                    // Skip empty segment
+                    if (currentMemory.Length > 0)
+                    {
+                        if (remainingOffset >= currentMemory.Length)
+                        {
+                            // Subtract current non consumed data
+                            remainingOffset -= currentMemory.Length;
+                        }
+                        else
+                        {
+                            break;
+                        }
+                    }
+                }
+
+                value = currentMemory.Span[(int)remainingOffset];
+                return true;
+            }
+        }
+
         /// <summary>Read the next value and advance the reader.</summary>
         /// <param name="value">The next value or default if at the end.</param>
         /// <returns>False if at the end of the reader.</returns>
diff --git a/src/DotNetty.Buffers/Reader/ByteBufferReaderExtensions.Binary.cs b/src/DotNetty.Buffers/Reader/ByteBufferReaderExtensions.Binary.cs
index 8d12106f9..31b54cb39 100644
--- a/src/DotNetty.Buffers/Reader/ByteBufferReaderExtensions.Binary.cs
+++ b/src/DotNetty.Buffers/Reader/ByteBufferReaderExtensions.Binary.cs
@@ -248,7 +248,7 @@ private static bool TryReadReverseEndianness(ref ByteBufferReader reader, out us
 
         public static unsafe bool TryReadUnsignedMedium(ref this ByteBufferReader reader, out int value)
         {
-            if (reader.TryPeek(MediumSize, out var span))
+            if (reader.TryPeek(MediumSize, out ReadOnlySpan<byte> span))
             {
                 //fixed (byte* bytes = &MemoryMarshal.GetReference(span))
                 //{
@@ -264,7 +264,7 @@ public static unsafe bool TryReadUnsignedMedium(ref this ByteBufferReader reader
 
         public static bool TryReadUnsignedMediumLE(ref this ByteBufferReader reader, out int value)
         {
-            if (reader.TryPeek(MediumSize, out var span))
+            if (reader.TryPeek(MediumSize, out ReadOnlySpan<byte> span))
             {
                 ref byte b = ref MemoryMarshal.GetReference(span);
                 value = b | Unsafe.Add(ref b, 1) << 8 | Unsafe.Add(ref b, 2) << 16;
diff --git a/src/DotNetty.Common/Internal/AppendableCharSequence.NetStandard.cs b/src/DotNetty.Common/Internal/AppendableCharSequence.NetStandard.cs
index 4ff856bb8..ff43815db 100644
--- a/src/DotNetty.Common/Internal/AppendableCharSequence.NetStandard.cs
+++ b/src/DotNetty.Common/Internal/AppendableCharSequence.NetStandard.cs
@@ -5,8 +5,10 @@ namespace DotNetty.Common.Internal
 {
     using System;
     using System.Runtime.CompilerServices;
-    using System.Runtime.InteropServices;
     using DotNetty.Common.Utilities;
+#if !NET
+    using System.Runtime.InteropServices;
+#endif
 
     partial class AppendableCharSequence : IHasAsciiSpan
     {
@@ -28,8 +30,12 @@ public bool Equals(AppendableCharSequence other)
                 return true;
             }
 
+#if NET
+            return other is object && AsciiSpan.SequenceEqual(other.AsciiSpan);
+#else
             return other is object && _pos == other._pos
                 && SpanHelpers.SequenceEqual(ref MemoryMarshal.GetReference(AsciiSpan), ref MemoryMarshal.GetReference(other.AsciiSpan), _pos);
+#endif
         }
 
         public override bool Equals(object obj)
@@ -39,8 +45,12 @@ public override bool Equals(object obj)
             switch (obj)
             {
                 case AppendableCharSequence other:
+#if NET
+                    return AsciiSpan.SequenceEqual(other.AsciiSpan);
+#else
                     return _pos == other._pos
                         && SpanHelpers.SequenceEqual(ref MemoryMarshal.GetReference(AsciiSpan), ref MemoryMarshal.GetReference(other.AsciiSpan), _pos);
+#endif
 
                 case IHasAsciiSpan hasAscii:
                     return AsciiSpan.SequenceEqual(hasAscii.AsciiSpan);
@@ -63,8 +73,12 @@ bool IEquatable<ICharSequence>.Equals(ICharSequence other)
                     return false;
 
                 case AppendableCharSequence comparand:
+#if NET
+                    return AsciiSpan.SequenceEqual(comparand.AsciiSpan);
+#else
                     return _pos == comparand._pos
                         && SpanHelpers.SequenceEqual(ref MemoryMarshal.GetReference(AsciiSpan), ref MemoryMarshal.GetReference(comparand.AsciiSpan), _pos);
+#endif
 
                 case IHasAsciiSpan hasAscii:
                     return AsciiSpan.SequenceEqual(hasAscii.AsciiSpan);
diff --git a/src/DotNetty.Common/Internal/PlatformDependent.cs b/src/DotNetty.Common/Internal/PlatformDependent.cs
index b98391d6e..c56552e62 100644
--- a/src/DotNetty.Common/Internal/PlatformDependent.cs
+++ b/src/DotNetty.Common/Internal/PlatformDependent.cs
@@ -59,7 +59,11 @@ public static unsafe bool ByteArrayEquals(byte[] bytes1, int startPos1, byte[] b
                 return true;
             }
 
+#if NET
+            return new ReadOnlySpan<byte>(bytes1, startPos1, length).SequenceEqual(new ReadOnlySpan<byte>(bytes2, startPos2, length));
+#else
             return SpanHelpers.SequenceEqual(ref bytes1[startPos1], ref bytes2[startPos2], length);
+#endif
         }
 
         public static unsafe int ByteArrayEqualsConstantTime(byte[] bytes1, int startPos1, byte[] bytes2, int startPos2, int length)
diff --git a/src/DotNetty.Common/Internal/Utf8Utility.Validation.cs b/src/DotNetty.Common/Internal/Utf8Utility.Validation.cs
index 2141f1f2a..4b1c3effd 100644
--- a/src/DotNetty.Common/Internal/Utf8Utility.Validation.cs
+++ b/src/DotNetty.Common/Internal/Utf8Utility.Validation.cs
@@ -42,7 +42,7 @@ unsafe partial class Utf8Utility
                 // If so, short-circuit the remainder of the method.
 
                 inputLength -= (int)numAsciiBytesCounted;
-                if (0u >= inputLength)
+                if (0u >= (uint)inputLength)
                 {
                     utf16CodeUnitCountAdjustment = 0;
                     scalarCountAdjustment = 0;
diff --git a/src/DotNetty.Common/Utilities/AsciiString.NetStandard.cs b/src/DotNetty.Common/Utilities/AsciiString.NetStandard.cs
index d6e720551..9963e9e77 100644
--- a/src/DotNetty.Common/Utilities/AsciiString.NetStandard.cs
+++ b/src/DotNetty.Common/Utilities/AsciiString.NetStandard.cs
@@ -178,13 +178,25 @@ public bool ContentEquals(ICharSequence other)
             {
                 case AsciiString asciiStr:
                     return this.GetHashCode() == asciiStr.GetHashCode()
+#if NET
+                        && this.AsciiSpan.SequenceEqual(asciiStr.AsciiSpan);
+#else
                         && SpanHelpers.SequenceEqual(ref MemoryMarshal.GetReference(this.AsciiSpan), ref MemoryMarshal.GetReference(asciiStr.AsciiSpan), thisLength);
+#endif
 
                 case IHasAsciiSpan hasAscii:
+#if NET
+                    return this.AsciiSpan.SequenceEqual(hasAscii.AsciiSpan);
+#else
                     return SpanHelpers.SequenceEqual(ref MemoryMarshal.GetReference(this.AsciiSpan), ref MemoryMarshal.GetReference(hasAscii.AsciiSpan), thisLength);
+#endif
 
                 case IHasUtf16Span hasUtf16:
+#if NET
+                    return this.Utf16Span.SequenceEqual(hasUtf16.Utf16Span);
+#else
                     return SpanHelpers.SequenceEqual(ref MemoryMarshal.GetReference(this.Utf16Span), ref MemoryMarshal.GetReference(hasUtf16.Utf16Span), thisLength);
+#endif
 
                 default:
                     return ContentEquals0(other);
@@ -292,27 +304,43 @@ public int IndexOf(ICharSequence subString, int start)
             {
                 if (subString is IHasAsciiSpan hasAscii)
                 {
+#if NET
+                    return this.AsciiSpan.IndexOf(hasAscii.AsciiSpan);
+#else
                     return SpanHelpers.IndexOf(ref MemoryMarshal.GetReference(this.AsciiSpan), thisLen, ref MemoryMarshal.GetReference(hasAscii.AsciiSpan), subCount);
+#endif
                 }
                 if (subString is IHasUtf16Span hasUtf16)
                 {
+#if NET
+                    return this.Utf16Span.IndexOf(hasUtf16.Utf16Span);
+#else
                     return SpanHelpers.IndexOf(ref MemoryMarshal.GetReference(this.Utf16Span), thisLen, ref MemoryMarshal.GetReference(hasUtf16.Utf16Span), subCount);
+#endif
                 }
             }
             else
             {
                 if (subString is IHasAsciiSpan hasAscii)
                 {
+#if NET
+                    var result = this.AsciiSpan.Slice(start, searchLen).IndexOf(hasAscii.AsciiSpan);
+#else
                     var result = SpanHelpers.IndexOf(
                         ref Unsafe.Add(ref MemoryMarshal.GetReference(this.AsciiSpan), start), searchLen,
                         ref MemoryMarshal.GetReference(hasAscii.AsciiSpan), subCount);
+#endif
                     return SharedConstants.TooBigOrNegative >= (uint)result ? start + result : IndexNotFound;
                 }
                 if (subString is IHasUtf16Span hasUtf16)
                 {
+#if NET
+                    var result = this.Utf16Span.Slice(start, searchLen).IndexOf(hasUtf16.Utf16Span);
+#else
                     var result = SpanHelpers.IndexOf(
                         ref Unsafe.Add(ref MemoryMarshal.GetReference(this.Utf16Span), start), searchLen,
                         ref MemoryMarshal.GetReference(hasUtf16.Utf16Span), subCount);
+#endif
                     return SharedConstants.TooBigOrNegative >= (uint)result ? start + result : IndexNotFound;
                 }
             }
@@ -364,10 +392,18 @@ public int IndexOf(char ch, int start)
 
             if (0u >= uStart)
             {
+#if NET
+                return this.AsciiSpan.IndexOf((byte)ch);
+#else
                 return SpanHelpers.IndexOf(ref MemoryMarshal.GetReference(this.AsciiSpan), (byte)ch, thisLen);
+#endif
             }
             var seachSpan = this.AsciiSpan.Slice(start);
+#if NET
+            var result = seachSpan.IndexOf((byte)ch);
+#else
             var result = SpanHelpers.IndexOf(ref MemoryMarshal.GetReference(seachSpan), (byte)ch, seachSpan.Length);
+#endif
             return SharedConstants.TooBigOrNegative >= (uint)result ? start + result : IndexNotFound;
         }
 
@@ -386,15 +422,33 @@ public int LastIndexOf(ICharSequence subString, int start)
 
             if (subString is IHasAsciiSpan hasAscii)
             {
+#if NET
+                var searchLength = start + subCount;
+                if (searchLength > thisLen)
+                {
+                    return this.AsciiSpan.LastIndexOf(hasAscii.AsciiSpan);
+                }
+                return this.AsciiSpan.Slice(0, searchLength).LastIndexOf(hasAscii.AsciiSpan);
+#else
                 return SpanHelpers.LastIndexOf(
                     ref MemoryMarshal.GetReference(this.AsciiSpan), start + subCount,
                     ref MemoryMarshal.GetReference(hasAscii.AsciiSpan), subCount);
+#endif
             }
             if (subString is IHasUtf16Span hasUtf16)
             {
+#if NET
+                var searchLength = start + subCount;
+                if (searchLength > thisLen)
+                {
+                    return this.Utf16Span.LastIndexOf(hasUtf16.Utf16Span);
+                }
+                return this.Utf16Span.Slice(0, searchLength).LastIndexOf(hasUtf16.Utf16Span);
+#else
                 return SpanHelpers.LastIndexOf(
                     ref MemoryMarshal.GetReference(this.Utf16Span), start + subCount,
                     ref MemoryMarshal.GetReference(hasUtf16.Utf16Span), subCount);
+#endif
             }
 
             return LastIndexOf0(subString, start);
@@ -451,17 +505,25 @@ public bool RegionMatches(int thisStart, ICharSequence seq, int start, int count
 
             if (seq is IHasAsciiSpan hasAscii)
             {
+#if NET
+                return this.AsciiSpan.Slice(thisStart, count).SequenceEqual(hasAscii.AsciiSpan.Slice(start, count));
+#else
                 return SpanHelpers.SequenceEqual(
                     ref Unsafe.Add(ref MemoryMarshal.GetReference(this.AsciiSpan), thisStart),
                     ref Unsafe.Add(ref MemoryMarshal.GetReference(hasAscii.AsciiSpan), start),
                     count);
+#endif
             }
             if (seq is IHasUtf16Span hasUtf16)
             {
+#if NET
+                return this.Utf16Span.Slice(thisStart, count).SequenceEqual(hasUtf16.Utf16Span.Slice(start, count));
+#else
                 return SpanHelpers.SequenceEqual(
                     ref Unsafe.Add(ref MemoryMarshal.GetReference(this.Utf16Span), thisStart),
                     ref Unsafe.Add(ref MemoryMarshal.GetReference(hasUtf16.Utf16Span), start),
                     count);
+#endif
             }
 
             return RegionMatches0(thisStart, seq, start, count);
diff --git a/src/DotNetty.Common/Utilities/AsciiString.cs b/src/DotNetty.Common/Utilities/AsciiString.cs
index 50f79c3d0..4ddaac350 100644
--- a/src/DotNetty.Common/Utilities/AsciiString.cs
+++ b/src/DotNetty.Common/Utilities/AsciiString.cs
@@ -475,9 +475,13 @@ public int CompareTo(AsciiString other)
         {
             if (ReferenceEquals(this, other)) { return 0; }
 
+#if NET
+            return this.AsciiSpan.SequenceCompareTo(other.AsciiSpan);
+#else
             return SpanHelpers.SequenceCompareTo(
                 ref MemoryMarshal.GetReference(this.AsciiSpan), this.length,
                 ref MemoryMarshal.GetReference(other.AsciiSpan), other.Count);
+#endif
         }
 
         public int CompareTo(object obj) => this.CompareTo(obj as AsciiString);
diff --git a/src/DotNetty.Common/Utilities/CharUtil.cs b/src/DotNetty.Common/Utilities/CharUtil.cs
index 617253d82..851dd2206 100644
--- a/src/DotNetty.Common/Utilities/CharUtil.cs
+++ b/src/DotNetty.Common/Utilities/CharUtil.cs
@@ -31,8 +31,10 @@ namespace DotNetty.Common.Utilities
     using System;
     using System.Collections.Generic;
     using System.Runtime.CompilerServices;
+#if !NET
     using System.Runtime.InteropServices;
     using DotNetty.Common.Internal;
+#endif
 
     public static partial class CharUtil
     {
@@ -76,17 +78,25 @@ internal static bool ContentEquals(ICharSequence left, ICharSequence right)
 
             if (left is IHasAsciiSpan thisHasAscii && right is IHasAsciiSpan otherHasAscii)
             {
+#if NET
+                return thisHasAscii.AsciiSpan.SequenceEqual(otherHasAscii.AsciiSpan);
+#else
                 return SpanHelpers.SequenceEqual(
                     ref MemoryMarshal.GetReference(thisHasAscii.AsciiSpan),
                     ref MemoryMarshal.GetReference(otherHasAscii.AsciiSpan),
                     left.Count);
+#endif
             }
             else if (left is IHasUtf16Span thisHasUtf16 && right is IHasUtf16Span otherHasUtf16)
             {
+#if NET
+                return thisHasUtf16.Utf16Span.SequenceEqual(otherHasUtf16.Utf16Span);
+#else
                 return SpanHelpers.SequenceEqual(
                     ref MemoryMarshal.GetReference(thisHasUtf16.Utf16Span),
                     ref MemoryMarshal.GetReference(otherHasUtf16.Utf16Span),
                     left.Count);
+#endif
             }
 
             for (int i = 0; i < left.Count; i++)
@@ -141,10 +151,14 @@ public static bool RegionMatches(string value, int thisStart, ICharSequence othe
 
             if (other is IHasUtf16Span hasUtf16)
             {
+#if NET
+                return value.AsSpan().Slice(thisStart, length).SequenceEqual(hasUtf16.Utf16Span.Slice(start, length));
+#else
                 return SpanHelpers.SequenceEqual(
                     ref Unsafe.Add(ref MemoryMarshal.GetReference(value.AsSpan()), thisStart),
                     ref Unsafe.Add(ref MemoryMarshal.GetReference(hasUtf16.Utf16Span), start),
                     length);
+#endif
             }
             int o1 = thisStart;
             int o2 = start;
@@ -202,17 +216,25 @@ internal static bool RegionMatches(ICharSequence value, int thisStart, ICharSequ
 
             if (value is IHasAsciiSpan thisHasAscii && other is IHasAsciiSpan otherHasAscii)
             {
+#if NET
+                return thisHasAscii.AsciiSpan.Slice(thisStart, length).SequenceEqual(otherHasAscii.AsciiSpan.Slice(start, length));
+#else
                 return SpanHelpers.SequenceEqual(
                     ref Unsafe.Add(ref MemoryMarshal.GetReference(thisHasAscii.AsciiSpan), thisStart),
                     ref Unsafe.Add(ref MemoryMarshal.GetReference(otherHasAscii.AsciiSpan), start),
                     length);
+#endif
             }
             else if (value is IHasUtf16Span thisHasUtf16 && other is IHasUtf16Span otherHasUtf16)
             {
+#if NET
+                return thisHasUtf16.Utf16Span.Slice(thisStart, length).SequenceEqual(otherHasUtf16.Utf16Span.Slice(start, length));
+#else
                 return SpanHelpers.SequenceEqual(
                     ref Unsafe.Add(ref MemoryMarshal.GetReference(thisHasUtf16.Utf16Span), thisStart),
                     ref Unsafe.Add(ref MemoryMarshal.GetReference(otherHasUtf16.Utf16Span), start),
                     length);
+#endif
             }
 
             int o1 = thisStart;
diff --git a/src/DotNetty.Common/Utilities/ICharSequenceExtensions.cs b/src/DotNetty.Common/Utilities/ICharSequenceExtensions.cs
index 7cbf53a7c..cf11589e5 100644
--- a/src/DotNetty.Common/Utilities/ICharSequenceExtensions.cs
+++ b/src/DotNetty.Common/Utilities/ICharSequenceExtensions.cs
@@ -23,8 +23,12 @@
 namespace DotNetty.Common.Utilities
 {
     using System.Runtime.CompilerServices;
+#if NET
+    using System;
+#else
     using System.Runtime.InteropServices;
     using DotNetty.Common.Internal;
+#endif
 
 
     public static class ICharSequenceExtensions
@@ -38,12 +42,20 @@ public static bool Contains(this ICharSequence sequence, char c)
 
                 case IHasAsciiSpan hasAscii:
                     if ((uint)c > AsciiString.uMaxCharValue) { return false; }
+#if NET
+                    return hasAscii.AsciiSpan.Contains((byte)c);
+#else
                     var asciiSpan = hasAscii.AsciiSpan;
                     return SpanHelpers.Contains(ref MemoryMarshal.GetReference(asciiSpan), (byte)c, asciiSpan.Length);
+#endif
 
                 case IHasUtf16Span hasUtf16:
+#if NET
+                    return hasUtf16.Utf16Span.Contains(c);
+#else
                     var utf16Span = hasUtf16.Utf16Span;
                     return SpanHelpers.Contains(ref MemoryMarshal.GetReference(utf16Span), c, utf16Span.Length);
+#endif
 
                 default:
                     int length = sequence.Count;
diff --git a/src/DotNetty.Common/Utilities/StringUtil.cs b/src/DotNetty.Common/Utilities/StringUtil.cs
index ed382cfa3..871cff594 100644
--- a/src/DotNetty.Common/Utilities/StringUtil.cs
+++ b/src/DotNetty.Common/Utilities/StringUtil.cs
@@ -31,9 +31,11 @@ namespace DotNetty.Common.Utilities
     using System;
     using System.Collections.Generic;
     using System.Runtime.CompilerServices;
-    using System.Runtime.InteropServices;
     using System.Text;
     using DotNetty.Common.Internal;
+#if !NET
+    using System.Runtime.InteropServices;
+#endif
 
     /// <summary>
     ///     String utility class.
@@ -131,9 +133,13 @@ static bool RegionMatches(string value, int thisStart, string other, int start,
 
             if (0u >= (uint)length) { return true; }
 
+#if NET
+            return value.AsSpan().Slice(thisStart, length).SequenceEqual(other.AsSpan().Slice(start, length));
+#else
             ref char valueStart = ref MemoryMarshal.GetReference(value.AsSpan());
             ref char otherStart = ref MemoryMarshal.GetReference(other.AsSpan());
             return SpanHelpers.SequenceEqual(ref Unsafe.Add(ref valueStart, thisStart), ref Unsafe.Add(ref otherStart, start), length);
+#endif
         }
 
         /// <summary>
diff --git a/test/DotNetty.Buffers.ReaderWriter.Tests/DotNetty.Buffers.ReaderWriter.Tests.csproj b/test/DotNetty.Buffers.ReaderWriter.Tests/DotNetty.Buffers.ReaderWriter.Tests.csproj
index ae6b3839c..73f6f5f1e 100644
--- a/test/DotNetty.Buffers.ReaderWriter.Tests/DotNetty.Buffers.ReaderWriter.Tests.csproj
+++ b/test/DotNetty.Buffers.ReaderWriter.Tests/DotNetty.Buffers.ReaderWriter.Tests.csproj
@@ -1,7 +1,7 @@
 ﻿<Project Sdk="Microsoft.NET.Sdk" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
 
   <PropertyGroup>
-    <TargetFrameworks>netcoreapp3.1;netcoreapp2.1</TargetFrameworks>
+    <TargetFrameworks>net5.0;netcoreapp3.1;netcoreapp2.1</TargetFrameworks>
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
   </PropertyGroup>
 
diff --git a/test/DotNetty.Buffers.ReaderWriter.Tests/test_corefx/BasicTests.cs b/test/DotNetty.Buffers.ReaderWriter.Tests/test_corefx/BasicTests.cs
index fd35ad23a..cc981b1f2 100644
--- a/test/DotNetty.Buffers.ReaderWriter.Tests/test_corefx/BasicTests.cs
+++ b/test/DotNetty.Buffers.ReaderWriter.Tests/test_corefx/BasicTests.cs
@@ -98,6 +98,7 @@ public void DefaultState()
             ByteBufferReader reader = default;
             Assert.Equal(0, reader.CurrentSpan.Length);
             Assert.Equal(0, reader.UnreadSpan.Length);
+            Assert.Equal(0, reader.UnreadSequence.Length);
             Assert.Equal(0, reader.Consumed);
             Assert.Equal(0, reader.CurrentSpanIndex);
             Assert.Equal(0, reader.Length);
@@ -114,7 +115,9 @@ public void DefaultState()
             Assert.True(sequence.IsEmpty);
             Assert.False(reader.TryReadTo(out sequence, array));
             Assert.True(sequence.IsEmpty);
-            Assert.False(reader.TryReadTo(out ReadOnlySpan<byte> span, default));
+            Assert.False(reader.TryReadTo(out ReadOnlySpan<byte> span, default(byte)));
+            Assert.True(span.IsEmpty);
+            Assert.False(reader.TryReadTo(out span, array));
             Assert.True(span.IsEmpty);
             Assert.False(reader.TryReadToAny(out sequence, array));
             Assert.True(sequence.IsEmpty);
@@ -124,6 +127,7 @@ public void DefaultState()
             Assert.False(reader.TryAdvanceToAny(array));
             Assert.Equal(0, reader.CurrentSpan.Length);
             Assert.Equal(0, reader.UnreadSpan.Length);
+            Assert.Equal(0, reader.UnreadSequence.Length);
             Assert.Equal(0, reader.Consumed);
             Assert.Equal(0, reader.CurrentSpanIndex);
             Assert.Equal(0, reader.Length);
@@ -168,6 +172,138 @@ public void TryPeekReturnsWithoutMoving()
             Assert.Equal(2, reader.Remaining);
         }
 
+        [Fact]
+        public void TryPeekOffset()
+        {
+            ByteBufferReader reader = new ByteBufferReader(Factory.CreateWithContent(GetInputData(10)));
+            Assert.True(reader.TryRead(out byte first));
+            Assert.Equal(InputData[0], first);
+            Assert.True(reader.TryRead(out byte second));
+            Assert.Equal(InputData[1], second);
+
+            Assert.True(reader.TryPeek(7, out byte value));
+            Assert.Equal(InputData[9], value);
+
+            Assert.False(reader.TryPeek(8, out byte defaultValue));
+            Assert.Equal(default, defaultValue);
+
+            Assert.Equal(2, reader.Consumed);
+            Assert.Equal(8, reader.Remaining);
+        }
+
+        [Fact]
+        public void TryPeekOffset_AfterEnd()
+        {
+            ByteBufferReader reader = new ByteBufferReader(Factory.CreateWithContent(GetInputData(2)));
+            Assert.True(reader.TryRead(out byte first));
+            Assert.Equal(InputData[0], first);
+
+            Assert.True(reader.TryPeek(0, out byte value));
+            Assert.Equal(InputData[1], value);
+            Assert.Equal(1, reader.Remaining);
+
+            Assert.False(reader.TryPeek(1, out byte defaultValue));
+            Assert.Equal(default, defaultValue);
+        }
+
+        [Fact]
+        public void TryPeekOffset_RemainsZeroOffsetZero()
+        {
+            ByteBufferReader reader = new ByteBufferReader(Factory.CreateWithContent(GetInputData(1)));
+            Assert.True(reader.TryRead(out byte first));
+            Assert.Equal(InputData[0], first);
+            Assert.Equal(0, reader.Remaining);
+            Assert.False(reader.TryPeek(0, out byte defaultValue));
+            Assert.Equal(default, defaultValue);
+        }
+
+        [Fact]
+        public void TryPeekOffset_Empty()
+        {
+            ByteBufferReader reader = new ByteBufferReader(Factory.CreateWithContent(GetInputData(0)));
+            Assert.False(reader.TryPeek(0, out byte defaultValue));
+            Assert.Equal(default, defaultValue);
+        }
+
+        [Fact]
+        public void TryPeekOffset_MultiSegment_StarAhead()
+        {
+            ReadOnlySpan<byte> data = (byte[])_inputData.Clone();
+
+            SequenceSegment<byte> last = new SequenceSegment<byte>();
+            last.SetMemory(new OwnedArray<byte>(data.Slice(5).ToArray()), 0, 5);
+
+            SequenceSegment<byte> first = new SequenceSegment<byte>();
+            first.SetMemory(new OwnedArray<byte>(data.Slice(0, 5).ToArray()), 0, 5);
+            first.SetNext(last);
+
+            ReadOnlySequence<byte> sequence = new ReadOnlySequence<byte>(first, first.Start, last, last.End);
+            ByteBufferReader reader = new ByteBufferReader(sequence);
+
+            // Move by 2 elements
+            for (int i = 0; i < 2; i++)
+            {
+                Assert.True(reader.TryRead(out byte val));
+                Assert.Equal(InputData[i], val);
+            }
+
+            // We're on element 3; peek the last element of the first segment
+            Assert.True(reader.TryPeek(2, out byte lastElementFirstSegment));
+            Assert.Equal(InputData[4], lastElementFirstSegment);
+
+            // We're on element 3; peek the first element of the second segment
+            Assert.True(reader.TryPeek(3, out byte fistElementSecondSegment));
+            Assert.Equal(InputData[5], fistElementSecondSegment);
+
+            // We're on element 3; peek the last element of the second segment
+            Assert.True(reader.TryPeek(7, out byte lastElementSecondSegment));
+            Assert.Equal(InputData[9], lastElementSecondSegment);
+
+            // Offset 8 from element 3 is out of bounds
+            Assert.False(reader.TryPeek(8, out byte defaultValue));
+            Assert.Equal(default, defaultValue);
+
+            Assert.Equal(2, reader.Consumed);
+            Assert.Equal(8, reader.Remaining);
+        }
+
+        [Fact]
+        public void TryPeekOffset_MultiSegment_GetFirstGetLast()
+        {
+            ReadOnlySpan<byte> data = (byte[])_inputData.Clone();
+
+            SequenceSegment<byte> last = new SequenceSegment<byte>();
+            last.SetMemory(new OwnedArray<byte>(data.Slice(5).ToArray()), 0, 5);
+
+            SequenceSegment<byte> first = new SequenceSegment<byte>();
+            first.SetMemory(new OwnedArray<byte>(data.Slice(0, 5).ToArray()), 0, 5);
+            first.SetNext(last);
+
+            ReadOnlySequence<byte> sequence = new ReadOnlySequence<byte>(first, first.Start, last, last.End);
+            ByteBufferReader reader = new ByteBufferReader(sequence);
+
+            Assert.True(reader.TryPeek(0, out byte firstElement));
+            Assert.Equal(InputData[0], firstElement);
+
+            Assert.True(reader.TryPeek(data.Length - 1, out byte lastElemen));
+            Assert.Equal(InputData[data.Length - 1], lastElemen);
+
+            Assert.Equal(0, reader.Consumed);
+            Assert.Equal(10, reader.Remaining);
+        }
+
+        [Fact]
+        public void TryPeekOffset_InvalidOffset()
+        {
+            ArgumentOutOfRangeException exception = Assert.Throws<ArgumentOutOfRangeException>(() =>
+            {
+                ByteBufferReader reader = new ByteBufferReader(Factory.CreateWithContent(GetInputData(10)));
+                reader.TryPeek(-1, out _);
+            });
+
+            Assert.Equal("offset", exception.ParamName);
+        }
+
         [Fact]
         public void CursorIsCorrectAtEnd()
         {
@@ -493,6 +629,209 @@ public void AdvanceTo_AdvancePast()
             }
         }
 
+        [Fact]
+        public void AdvanceTo_End()
+        {
+            ReadOnlySpan<byte> data = (byte[])_inputData.Clone();
+
+            SequenceSegment<byte> last = new SequenceSegment<byte>();
+            last.SetMemory(new OwnedArray<byte>(data.Slice(5).ToArray()), 0, 5);
+
+            SequenceSegment<byte> first = new SequenceSegment<byte>();
+            first.SetMemory(new OwnedArray<byte>(data.Slice(0, 5).ToArray()), 0, 5);
+            first.SetNext(last);
+
+            ReadOnlySequence<byte> sequence = new ReadOnlySequence<byte>(first, first.Start, last, last.End);
+            ByteBufferReader reader = new ByteBufferReader(sequence);
+
+            reader.AdvanceToEnd();
+
+            Assert.Equal(data.Length, reader.Length);
+            Assert.Equal(data.Length, reader.Consumed);
+            Assert.Equal(reader.Length, reader.Consumed);
+            Assert.True(reader.End);
+            Assert.Equal(0, reader.CurrentSpanIndex);
+            Assert.Equal(sequence.End, reader.Position);
+            Assert.Equal(0, reader.Remaining);
+            Assert.True(default == reader.UnreadSpan);
+            Assert.True(default == reader.CurrentSpan);
+        }
+
+        [Fact]
+        public void AdvanceTo_End_EmptySegment()
+        {
+            ReadOnlySpan<byte> data = (byte[])_inputData.Clone();
+
+            // Empty segment
+            SequenceSegment<byte> third = new SequenceSegment<byte>();
+
+            SequenceSegment<byte> second = new SequenceSegment<byte>();
+            second.SetMemory(new OwnedArray<byte>(data.Slice(5).ToArray()), 0, 5);
+            second.SetNext(third);
+
+            SequenceSegment<byte> first = new SequenceSegment<byte>();
+            first.SetMemory(new OwnedArray<byte>(data.Slice(0, 5).ToArray()), 0, 5);
+            first.SetNext(second);
+
+            ReadOnlySequence<byte> sequence = new ReadOnlySequence<byte>(first, first.Start, third, third.End);
+            ByteBufferReader reader = new ByteBufferReader(sequence);
+
+            reader.AdvanceToEnd();
+
+            Assert.Equal(first.Length + second.Length, reader.Length);
+            Assert.Equal(first.Length + second.Length, reader.Consumed);
+            Assert.Equal(reader.Length, reader.Consumed);
+            Assert.True(reader.End);
+            Assert.Equal(0, reader.CurrentSpanIndex);
+            Assert.Equal(sequence.End, reader.Position);
+            Assert.Equal(0, reader.Remaining);
+            Assert.True(default == reader.UnreadSpan);
+            Assert.True(default == reader.CurrentSpan);
+        }
+
+        [Fact]
+        public void AdvanceTo_End_Rewind_Advance()
+        {
+            ReadOnlySpan<byte> data = (byte[])_inputData.Clone();
+
+            SequenceSegment<byte> last = new SequenceSegment<byte>();
+            last.SetMemory(new OwnedArray<byte>(data.Slice(5).ToArray()), 0, 5);
+
+            SequenceSegment<byte> first = new SequenceSegment<byte>();
+            first.SetMemory(new OwnedArray<byte>(data.Slice(0, 5).ToArray()), 0, 5);
+            first.SetNext(last);
+
+            ReadOnlySequence<byte> sequence = new ReadOnlySequence<byte>(first, first.Start, last, last.End);
+            ByteBufferReader reader = new ByteBufferReader(sequence);
+
+            reader.AdvanceToEnd();
+
+            Assert.Equal(data.Length, reader.Length);
+            Assert.Equal(data.Length, reader.Consumed);
+            Assert.Equal(reader.Length, reader.Consumed);
+            Assert.True(reader.End);
+            Assert.Equal(0, reader.CurrentSpanIndex);
+            Assert.Equal(sequence.End, reader.Position);
+            Assert.Equal(0, reader.Remaining);
+            Assert.True(default == reader.UnreadSpan);
+            Assert.True(default == reader.CurrentSpan);
+
+            // Rewind to second element
+            reader.Rewind(9);
+
+            Assert.Equal(1, reader.Consumed);
+            Assert.False(reader.End);
+            Assert.Equal(1, reader.CurrentSpanIndex);
+            Assert.Equal(9, reader.Remaining);
+            Assert.Equal(sequence.Slice(1), reader.UnreadSequence);
+
+            // Consume next five elements and stop at second element of second segment
+            reader.Advance(5);
+
+            Assert.Equal(6, reader.Consumed);
+            Assert.False(reader.End);
+            Assert.Equal(1, reader.CurrentSpanIndex);
+            Assert.Equal(4, reader.Remaining);
+            Assert.Equal(sequence.Slice(6), reader.UnreadSequence);
+
+            reader.AdvanceToEnd();
+
+            Assert.Equal(data.Length, reader.Length);
+            Assert.Equal(data.Length, reader.Consumed);
+            Assert.Equal(reader.Length, reader.Consumed);
+            Assert.True(reader.End);
+            Assert.Equal(0, reader.CurrentSpanIndex);
+            Assert.Equal(sequence.End, reader.Position);
+            Assert.Equal(0, reader.Remaining);
+            Assert.True(default == reader.UnreadSpan);
+            Assert.True(default == reader.CurrentSpan);
+        }
+
+        [Fact]
+        public void AdvanceTo_End_Multiple()
+        {
+            ReadOnlySpan<byte> data = (byte[])_inputData.Clone();
+
+            SequenceSegment<byte> last = new SequenceSegment<byte>();
+            last.SetMemory(new OwnedArray<byte>(data.Slice(5).ToArray()), 0, 5);
+
+            SequenceSegment<byte> first = new SequenceSegment<byte>();
+            first.SetMemory(new OwnedArray<byte>(data.Slice(0, 5).ToArray()), 0, 5);
+            first.SetNext(last);
+
+            ReadOnlySequence<byte> sequence = new ReadOnlySequence<byte>(first, first.Start, last, last.End);
+            ByteBufferReader reader = new ByteBufferReader(sequence);
+
+            reader.AdvanceToEnd();
+            reader.AdvanceToEnd();
+            reader.AdvanceToEnd();
+
+            Assert.Equal(data.Length, reader.Length);
+            Assert.Equal(data.Length, reader.Consumed);
+            Assert.Equal(reader.Length, reader.Consumed);
+            Assert.True(reader.End);
+            Assert.Equal(0, reader.CurrentSpanIndex);
+            Assert.Equal(sequence.End, reader.Position);
+            Assert.Equal(0, reader.Remaining);
+            Assert.True(default == reader.UnreadSpan);
+            Assert.True(default == reader.CurrentSpan);
+        }
+
+        [Fact]
+        public void UnreadSequence()
+        {
+            ReadOnlySpan<byte> data = (byte[])_inputData.Clone();
+
+            SequenceSegment<byte> last = new SequenceSegment<byte>();
+            last.SetMemory(new OwnedArray<byte>(data.Slice(5).ToArray()), 0, 5);
+
+            SequenceSegment<byte> first = new SequenceSegment<byte>();
+            first.SetMemory(new OwnedArray<byte>(data.Slice(0, 5).ToArray()), 0, 5);
+            first.SetNext(last);
+
+            ReadOnlySequence<byte> sequence = new ReadOnlySequence<byte>(first, first.Start, last, last.End);
+            ByteBufferReader reader = new ByteBufferReader(sequence);
+
+            Assert.Equal(sequence, reader.UnreadSequence);
+            Assert.Equal(data.Length, reader.UnreadSequence.Length);
+            Assert.True(reader.TryRead(out byte _));
+            Assert.True(reader.TryRead(out byte _));
+            Assert.Equal(sequence.Slice(2), reader.UnreadSequence);
+            // Advance to the end
+            reader.Advance(8);
+            Assert.Equal(0, reader.UnreadSequence.Length);
+        }
+
+        [Fact]
+        public void UnreadSequence_EmptySegment()
+        {
+            ReadOnlySpan<byte> data = (byte[])_inputData.Clone();
+
+            // Empty segment
+            SequenceSegment<byte> third = new SequenceSegment<byte>();
+
+            SequenceSegment<byte> second = new SequenceSegment<byte>();
+            second.SetMemory(new OwnedArray<byte>(data.Slice(5).ToArray()), 0, 5);
+            second.SetNext(third);
+
+            SequenceSegment<byte> first = new SequenceSegment<byte>();
+            first.SetMemory(new OwnedArray<byte>(data.Slice(0, 5).ToArray()), 0, 5);
+            first.SetNext(second);
+
+            ReadOnlySequence<byte> sequence = new ReadOnlySequence<byte>(first, first.Start, third, third.End);
+            ByteBufferReader reader = new ByteBufferReader(sequence);
+
+            // Drain until the expected end of data with simple read
+            for (int i = 0; i < data.Length; i++)
+            {
+                reader.TryRead(out byte _);
+            }
+
+            Assert.Equal(sequence.Slice(data.Length), reader.UnreadSequence);
+            Assert.Equal(0, reader.UnreadSequence.Length);
+            Assert.False(reader.TryRead(out byte _));
+        }
+
         [Fact]
         public void CopyToSmallerBufferWorks()
         {
diff --git a/test/DotNetty.Buffers.ReaderWriter.Tests/test_corefx/ReadTo.cs b/test/DotNetty.Buffers.ReaderWriter.Tests/test_corefx/ReadTo.cs
index b1800fe50..7ecea18bd 100644
--- a/test/DotNetty.Buffers.ReaderWriter.Tests/test_corefx/ReadTo.cs
+++ b/test/DotNetty.Buffers.ReaderWriter.Tests/test_corefx/ReadTo.cs
@@ -121,7 +121,7 @@ public void TryReadToSpan_Sequence(bool advancePastDelimiter)
                 new byte[] { 3, 3, 4, 4, 5, 5, 6, 6 }
             });
 
-            ByteBufferReader reader = new ByteBufferReader(bytes);
+            ByteBufferReader baseReader = new ByteBufferReader(bytes);
             for (byte i = 0; i < bytes.Length / 2 - 1; i++)
             {
                 byte[] expected = new byte[i * 2 + 1];
@@ -131,7 +131,12 @@ public void TryReadToSpan_Sequence(bool advancePastDelimiter)
                 }
                 expected[i * 2] = i;
                 ReadOnlySpan<byte> searchFor = new byte[] { i, (byte)(i + 1) };
-                ByteBufferReader copy = reader;
+                ByteBufferReader copy = baseReader;
+
+                Assert.True(copy.TryReadTo(out ReadOnlySpan<byte> sp, searchFor, advancePastDelimiter));
+                Assert.True(sp.SequenceEqual(expected));
+
+                copy = baseReader;
                 Assert.True(copy.TryReadTo(out ReadOnlySequence<byte> seq, searchFor, advancePastDelimiter));
                 Assert.True(seq.ToArray().AsSpan().SequenceEqual(expected));
             }
@@ -140,8 +145,14 @@ public void TryReadToSpan_Sequence(bool advancePastDelimiter)
                 new byte[] { 47, 42, 66, 32, 42, 32, 66, 42, 47 }   // /*b * b*/
             });
 
-            reader = new ByteBufferReader(bytes);
-            Assert.True(reader.TryReadTo(out ReadOnlySequence<byte> sequence, new byte[] { 42, 47 }, advancePastDelimiter));    //  */
+            baseReader = new ByteBufferReader(bytes);
+            ByteBufferReader copyReader = baseReader;
+
+            Assert.True(copyReader.TryReadTo(out ReadOnlySpan<byte> span, new byte[] { 42, 47 }, advancePastDelimiter));    //  */
+            Assert.True(span.SequenceEqual(new byte[] { 47, 42, 66, 32, 42, 32, 66 }));
+
+            copyReader = baseReader;
+            Assert.True(copyReader.TryReadTo(out ReadOnlySequence<byte> sequence, new byte[] { 42, 47 }, advancePastDelimiter));    //  */
             Assert.True(sequence.ToArray().AsSpan().SequenceEqual(new byte[] { 47, 42, 66, 32, 42, 32, 66 }));
         }
 
@@ -183,17 +194,30 @@ public void TryReadTo_SingleDelimiter()
                 new byte[] { 2, 3, 4, 5, 6 }
             });
 
-            ByteBufferReader reader = new ByteBufferReader(bytes);
+            ByteBufferReader baseReader = new ByteBufferReader(bytes);
+
+            ByteBufferReader spanReader = baseReader;
+            ByteBufferReader sequenceReader = baseReader;
             Span<byte> delimiter = new byte[] { 1 };
 
             for (int i = 1; i < 6; i += 1)
             {
                 // Also check scanning from the start.
-                ByteBufferReader resetReader = new ByteBufferReader(bytes);
+                ByteBufferReader resetReader = baseReader;
                 delimiter[0] = (byte)i;
-                Assert.True(reader.TryReadTo(out ReadOnlySequence<byte> sequence, delimiter, advancePastDelimiter: true));
+                Assert.True(spanReader.TryReadTo(out ReadOnlySpan<byte> span, delimiter, advancePastDelimiter: true));
+                Assert.True(resetReader.TryReadTo(out span, delimiter, advancePastDelimiter: true));
+                Assert.True(spanReader.TryPeek(out byte value));
+                Assert.Equal(i + 1, value);
+                Assert.True(resetReader.TryPeek(out value));
+                Assert.Equal(i + 1, value);
+
+                // Also check scanning from the start.
+                resetReader = baseReader;
+                delimiter[0] = (byte)i;
+                Assert.True(sequenceReader.TryReadTo(out ReadOnlySequence<byte> sequence, delimiter, advancePastDelimiter: true));
                 Assert.True(resetReader.TryReadTo(out sequence, delimiter, advancePastDelimiter: true));
-                Assert.True(reader.TryPeek(out byte value));
+                Assert.True(sequenceReader.TryPeek(out value));
                 Assert.Equal(i + 1, value);
                 Assert.True(resetReader.TryPeek(out value));
                 Assert.Equal(i + 1, value);
@@ -208,7 +232,9 @@ public void TryReadTo_Span_At_Segments_Boundary()
             segment.Append(Text.Encoding.ASCII.GetBytes("\nWorld")); // add next segment
             ReadOnlySequence<byte> inputSeq = new ReadOnlySequence<byte>(segment, 0, segment, 6); // span only the first segment!
             ByteBufferReader sr = new ByteBufferReader(inputSeq);
-            bool r = sr.TryReadTo(out _, delimiter);
+            bool r = sr.TryReadTo(out ReadOnlySpan<byte> _, delimiter);
+            Assert.False(r);
+            r = sr.TryReadTo(out ReadOnlySequence<byte> _, delimiter);
             Assert.False(r);
         }
     }
diff --git a/test/DotNetty.Buffers.ReaderWriter.Tests/test_corefxlab/ReaderBasicTests.cs b/test/DotNetty.Buffers.ReaderWriter.Tests/test_corefxlab/ReaderBasicTests.cs
index 414152ee0..b2dc85d57 100644
--- a/test/DotNetty.Buffers.ReaderWriter.Tests/test_corefxlab/ReaderBasicTests.cs
+++ b/test/DotNetty.Buffers.ReaderWriter.Tests/test_corefxlab/ReaderBasicTests.cs
@@ -83,7 +83,7 @@ public void DefaultState()
             Assert.True(sequence.IsEmpty);
             Assert.False(reader.TryReadTo(out sequence, array));
             Assert.True(sequence.IsEmpty);
-            Assert.False(reader.TryReadTo(out ReadOnlySpan<byte> span, default));
+            Assert.False(reader.TryReadTo(out ReadOnlySpan<byte> span, default(byte)));
             Assert.True(span.IsEmpty);
             Assert.False(reader.TryReadToAny(out sequence, array));
             Assert.True(sequence.IsEmpty);

From 9e0a7409f3449e96b2ac990c43a2ba4a188d3728 Mon Sep 17 00:00:00 2001
From: cuteant <cuteant@outlook.com>
Date: Thu, 24 Jun 2021 22:13:10 +0800
Subject: [PATCH 4/5] Using MemoryMarshal.GetArrayDataReference in .net5.0

---
 src/DotNetty.Buffers/ArrayPooledByteBuffer.cs    |  7 +++++++
 .../ArrayPooledUnsafeDirectByteBuffer.cs         | 12 +++++++++++-
 src/DotNetty.Buffers/PooledHeapByteBuffer.cs     |  8 ++++++++
 src/DotNetty.Buffers/UnpooledHeapByteBuffer.cs   |  7 +++++++
 .../UnpooledUnsafeDirectByteBuffer.cs            | 16 +++++++++++++++-
 .../Internal/PlatformDependent.cs                |  8 ++++++++
 6 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/src/DotNetty.Buffers/ArrayPooledByteBuffer.cs b/src/DotNetty.Buffers/ArrayPooledByteBuffer.cs
index e75b040b3..ff0e311b9 100644
--- a/src/DotNetty.Buffers/ArrayPooledByteBuffer.cs
+++ b/src/DotNetty.Buffers/ArrayPooledByteBuffer.cs
@@ -24,6 +24,9 @@
 using System.Buffers;
 using DotNetty.Common;
 using DotNetty.Common.Internal;
+#if NET
+using System.Runtime.InteropServices;
+#endif
 
 namespace DotNetty.Buffers
 {
@@ -186,7 +189,11 @@ public sealed override byte[] Array
         public sealed override ref byte GetPinnableMemoryAddress()
         {
             EnsureAccessible();
+#if NET
+            return ref MemoryMarshal.GetArrayDataReference(Memory);
+#else
             return ref Memory[0];
+#endif
         }
 
         public sealed override IntPtr AddressOfPinnedMemory() => IntPtr.Zero;
diff --git a/src/DotNetty.Buffers/ArrayPooledUnsafeDirectByteBuffer.cs b/src/DotNetty.Buffers/ArrayPooledUnsafeDirectByteBuffer.cs
index 49146c5cb..4c150bd70 100644
--- a/src/DotNetty.Buffers/ArrayPooledUnsafeDirectByteBuffer.cs
+++ b/src/DotNetty.Buffers/ArrayPooledUnsafeDirectByteBuffer.cs
@@ -27,6 +27,9 @@
 using System.Threading;
 using System.Threading.Tasks;
 using DotNetty.Common;
+#if NET
+using System.Runtime.InteropServices;
+#endif
 
 namespace DotNetty.Buffers
 {
@@ -248,7 +251,14 @@ public sealed override IByteBuffer Copy(int index, int length)
         }
 
         [MethodImpl(InlineMethod.AggressiveOptimization)]
-        ref byte Addr(int index) => ref Memory[index];
+        ref byte Addr(int index)
+        {
+#if NET
+            return ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(Memory), index);
+#else
+            return ref Memory[index];
+#endif
+        }
 
         public sealed override IByteBuffer SetZero(int index, int length)
         {
diff --git a/src/DotNetty.Buffers/PooledHeapByteBuffer.cs b/src/DotNetty.Buffers/PooledHeapByteBuffer.cs
index 78fd81e96..f3d47c279 100644
--- a/src/DotNetty.Buffers/PooledHeapByteBuffer.cs
+++ b/src/DotNetty.Buffers/PooledHeapByteBuffer.cs
@@ -31,6 +31,10 @@ namespace DotNetty.Buffers
     using System.Threading.Tasks;
     using DotNetty.Common;
     using DotNetty.Common.Internal;
+#if NET
+    using System.Runtime.CompilerServices;
+    using System.Runtime.InteropServices;
+#endif
 
     sealed partial class PooledHeapByteBuffer : PooledByteBuffer<byte[]>
     {
@@ -212,7 +216,11 @@ public sealed override byte[] Array
         public sealed override ref byte GetPinnableMemoryAddress()
         {
             EnsureAccessible();
+#if NET
+            return ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(Memory), Offset);
+#else
             return ref Memory[Offset];
+#endif
         }
 
         public sealed override IntPtr AddressOfPinnedMemory() => IntPtr.Zero;
diff --git a/src/DotNetty.Buffers/UnpooledHeapByteBuffer.cs b/src/DotNetty.Buffers/UnpooledHeapByteBuffer.cs
index 45a798b14..8ec1653be 100644
--- a/src/DotNetty.Buffers/UnpooledHeapByteBuffer.cs
+++ b/src/DotNetty.Buffers/UnpooledHeapByteBuffer.cs
@@ -30,6 +30,9 @@ namespace DotNetty.Buffers
     using System.Threading;
     using System.Threading.Tasks;
     using DotNetty.Common.Internal;
+#if NET
+    using System.Runtime.InteropServices;
+#endif
 
     partial class UnpooledHeapByteBuffer : AbstractReferenceCountedByteBuffer
     {
@@ -134,7 +137,11 @@ public sealed override byte[] Array
         public sealed override ref byte GetPinnableMemoryAddress()
         {
             EnsureAccessible();
+#if NET
+            return ref MemoryMarshal.GetArrayDataReference(_array);
+#else
             return ref _array[0];
+#endif
         }
 
         public sealed override IntPtr AddressOfPinnedMemory() => IntPtr.Zero;
diff --git a/src/DotNetty.Buffers/UnpooledUnsafeDirectByteBuffer.cs b/src/DotNetty.Buffers/UnpooledUnsafeDirectByteBuffer.cs
index 04d552578..dc8fb48c9 100644
--- a/src/DotNetty.Buffers/UnpooledUnsafeDirectByteBuffer.cs
+++ b/src/DotNetty.Buffers/UnpooledUnsafeDirectByteBuffer.cs
@@ -31,6 +31,9 @@ namespace DotNetty.Buffers
     using System.Threading;
     using System.Threading.Tasks;
     using DotNetty.Common.Internal;
+#if NET
+    using System.Runtime.InteropServices;
+#endif
 
     unsafe partial class UnpooledUnsafeDirectByteBuffer : AbstractReferenceCountedByteBuffer
     {
@@ -190,7 +193,11 @@ protected internal sealed override void Deallocate()
         public sealed override ref byte GetPinnableMemoryAddress()
         {
             EnsureAccessible();
+#if NET
+            return ref MemoryMarshal.GetArrayDataReference(_buffer);
+#else
             return ref _buffer[0];
+#endif
         }
 
         public sealed override bool IsContiguous => true;
@@ -393,7 +400,14 @@ public sealed override IByteBuffer Copy(int index, int length)
         }
 
         [MethodImpl(InlineMethod.AggressiveOptimization)]
-        ref byte Addr(int index) => ref _buffer[index];
+        ref byte Addr(int index)
+        {
+#if NET
+            return ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(_buffer), index);
+#else
+            return ref _buffer[index];
+#endif
+        }
 
         public sealed override IByteBuffer SetZero(int index, int length)
         {
diff --git a/src/DotNetty.Common/Internal/PlatformDependent.cs b/src/DotNetty.Common/Internal/PlatformDependent.cs
index c56552e62..01d4ae25b 100644
--- a/src/DotNetty.Common/Internal/PlatformDependent.cs
+++ b/src/DotNetty.Common/Internal/PlatformDependent.cs
@@ -12,6 +12,9 @@ namespace DotNetty.Common.Internal
     using System.Threading;
     using DotNetty.Common.Internal.Logging;
     using DotNetty.Common.Utilities;
+#if NET
+    using System.Runtime.InteropServices;
+#endif
 
     using static PlatformDependent0;
 
@@ -262,6 +265,11 @@ public static void CopyMemory(byte[] src, int srcIndex, byte[] dst, int dstIndex
                     }
                 }
             }
+#elif NET
+            Unsafe.CopyBlockUnaligned(
+                ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(dst), dstIndex), 
+                ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(src), srcIndex), 
+                nlen);
 #else
             Unsafe.CopyBlockUnaligned(ref dst[dstIndex], ref src[srcIndex], nlen);
 #endif

From e55c9b9d9e2e282280435a06578fd18bbae9ba52 Mon Sep 17 00:00:00 2001
From: cuteant <cuteant@outlook.com>
Date: Thu, 24 Jun 2021 23:00:15 +0800
Subject: [PATCH 5/5] added MyGet package info in readme

---
 README.md        | 24 ++++++++++++------------
 localPublish.cmd |  2 +-
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index a83390243..20f9b2f6c 100644
--- a/README.md
+++ b/README.md
@@ -30,18 +30,18 @@ This is a fork of [DotNetty](https://github.com/azure/dotnetty).
 * Nightly builds are available on [MyGet](https://www.myget.org/F/cuteant/api/v2).
 
 
-|NuGet Package|Status|
-|------|-------------|
-|SpanNetty.Common|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Common)](https://www.nuget.org/packages/SpanNetty.Common/)|
-|SpanNetty.Buffers|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Buffers)](https://www.nuget.org/packages/SpanNetty.Buffers/)|
-|SpanNetty.Codecs|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Codecs)](https://www.nuget.org/packages/SpanNetty.Codecs/)|
-|SpanNetty.Codecs.Http|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Codecs.Http)](https://www.nuget.org/packages/SpanNetty.Codecs.Http/)|
-|SpanNetty.Codecs.Http2|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Codecs.Http2)](https://www.nuget.org/packages/SpanNetty.Codecs.Http2/)|
-|SpanNetty.Codecs.Mqtt|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Codecs.Mqtt)](https://www.nuget.org/packages/SpanNetty.Codecs.Mqtt/)|
-|SpanNetty.Codecs.Protobuf|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Codecs.Protobuf)](https://www.nuget.org/packages/SpanNetty.Codecs.Protobuf/)|
-|SpanNetty.Handlers|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Handlers)](https://www.nuget.org/packages/SpanNetty.Handlers/)|
-|SpanNetty.Transport|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Transport)](https://www.nuget.org/packages/SpanNetty.Transport/)|
-|SpanNetty.Transport.Libuv|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Transport.Libuv)](https://www.nuget.org/packages/SpanNetty.Transport.Libuv/)|
+|Package|NuGet Version|MyGet Version|
+|------|-------------|-------------|
+|SpanNetty.Common|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Common)](https://www.nuget.org/packages/SpanNetty.Common/)|[![MyGet Version](https://img.shields.io/myget/cuteant/vpre/SpanNetty.Common)](https://www.myget.org/feed/cuteant/package/nuget/SpanNetty.Common)|
+|SpanNetty.Buffers|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Buffers)](https://www.nuget.org/packages/SpanNetty.Buffers/)|[![MyGet Version](https://img.shields.io/myget/cuteant/vpre/SpanNetty.Buffers)](https://www.myget.org/feed/cuteant/package/nuget/SpanNetty.Buffers)|
+|SpanNetty.Codecs|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Codecs)](https://www.nuget.org/packages/SpanNetty.Codecs/)|[![MyGet Version](https://img.shields.io/myget/cuteant/vpre/SpanNetty.Codecs)](https://www.myget.org/feed/cuteant/package/nuget/SpanNetty.Codecs)|
+|SpanNetty.Codecs.Http|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Codecs.Http)](https://www.nuget.org/packages/SpanNetty.Codecs.Http/)|[![MyGet Version](https://img.shields.io/myget/cuteant/vpre/SpanNetty.Codecs.Http)](https://www.myget.org/feed/cuteant/package/nuget/SpanNetty.Codecs.Http)|
+|SpanNetty.Codecs.Http2|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Codecs.Http2)](https://www.nuget.org/packages/SpanNetty.Codecs.Http2/)|[![MyGet Version](https://img.shields.io/myget/cuteant/vpre/SpanNetty.Codecs.Http2)](https://www.myget.org/feed/cuteant/package/nuget/SpanNetty.Codecs.Http2)|
+|SpanNetty.Codecs.Mqtt|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Codecs.Mqtt)](https://www.nuget.org/packages/SpanNetty.Codecs.Mqtt/)|[![MyGet Version](https://img.shields.io/myget/cuteant/vpre/SpanNetty.Codecs.Mqtt)](https://www.myget.org/feed/cuteant/package/nuget/SpanNetty.Codecs.Mqtt)|
+|SpanNetty.Codecs.Protobuf|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Codecs.Protobuf)](https://www.nuget.org/packages/SpanNetty.Codecs.Protobuf/)|[![MyGet Version](https://img.shields.io/myget/cuteant/vpre/SpanNetty.Codecs.Protobuf)](https://www.myget.org/feed/cuteant/package/nuget/SpanNetty.Codecs.Protobuf)|
+|SpanNetty.Handlers|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Handlers)](https://www.nuget.org/packages/SpanNetty.Handlers/)|[![MyGet Version](https://img.shields.io/myget/cuteant/vpre/SpanNetty.Handlers)](https://www.myget.org/feed/cuteant/package/nuget/SpanNetty.Handlers)|
+|SpanNetty.Transport|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Transport)](https://www.nuget.org/packages/SpanNetty.Transport/)|[![MyGet Version](https://img.shields.io/myget/cuteant/vpre/SpanNetty.Transport)](https://www.myget.org/feed/cuteant/package/nuget/SpanNetty.Transport)|
+|SpanNetty.Transport.Libuv|[![NuGet Version and Downloads count](https://buildstats.info/nuget/SpanNetty.Transport.Libuv)](https://www.nuget.org/packages/SpanNetty.Transport.Libuv/)|[![MyGet Version](https://img.shields.io/myget/cuteant/vpre/SpanNetty.Transport.Libuv)](https://www.myget.org/feed/cuteant/package/nuget/SpanNetty.Transport.Libuv)|
 
 ## Performance
 
diff --git a/localPublish.cmd b/localPublish.cmd
index b79f592cf..09e1d5cf2 100644
--- a/localPublish.cmd
+++ b/localPublish.cmd
@@ -18,7 +18,7 @@ call Ensure-DotNetSdk.cmd
 SET SOLUTION=%CMDHOME%\DotNetty.CrossPlatform.sln
 
 :: Set DateTime prefix or suffix for builds
-if "%PublishConfiguration%" == "dev" for /f %%j in ('powershell -NoProfile -ExecutionPolicy ByPass Get-Date -format "{yyMMddHHmm}"') do set DATE_SUFFIX=%%j
+if "%PublishConfiguration%" == "dev" for /f %%j in ('powershell -NoProfile -ExecutionPolicy ByPass Get-Date -format "{yyMMdd}"') do set DATE_SUFFIX=%%j
 if "%PublishConfiguration%" == "dev" SET AdditionalConfigurationProperties=;VersionDateSuffix=%DATE_SUFFIX%
 if "%PublishConfiguration%" == "release" for /f %%j in ('powershell -NoProfile -ExecutionPolicy ByPass Get-Date -format "{yyMM}"') do set YEAR_PREFIX=%%j
 if "%PublishConfiguration%" == "release" for /f %%j in ('powershell -NoProfile -ExecutionPolicy ByPass Get-Date -format "{ddHH}"') do set DATE_PREFIX=%%j