From 9f15f720535add49e7361f8066e18e70bd94eb2a Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Wed, 29 Mar 2023 18:37:41 +0200 Subject: [PATCH 1/8] Improve writing of lower vector part in ascii conversion * from 10 /17 to 1 instruction for 64/32 bit x86 --- .../src/System/Text/Ascii.Utility.cs | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 2d2ec7a381d03..0007ae4251633 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -1394,6 +1394,19 @@ private static bool VectorContainsNonAsciiChar(Vector128 asciiVector) } } + /// + /// Stores to lower 64bits of to memory destination of [] + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void StoreLower(Vector128 byteVector, ref byte bytePtr, nuint elementOffset) + { + // GetLower().StoreUnsafe is quite inneficient on x86, below code make sure that the store is a single instruction instead of ~10 and stack spills + if (Sse2.IsSupported) + Sse2.StoreScalar((long*)((byte*)Unsafe.AsPointer(ref bytePtr) + elementOffset), byteVector.AsInt64()); + else + byteVector.GetLower().StoreUnsafe(ref bytePtr, elementOffset); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool VectorContainsNonAsciiChar(Vector128 utf16Vector) { @@ -1484,7 +1497,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, ref byte asciiBuffer = ref *pAsciiBuffer; Vector128 asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); - asciiVector.GetLower().StoreUnsafe(ref asciiBuffer); + StoreLower(asciiVector, ref asciiBuffer, 0); nuint currentOffsetInElements = SizeOfVector128 / 2; // we processed 8 elements so far // We're going to get the best performance when we have 
aligned writes, so we'll take the @@ -1511,7 +1524,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination. asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); - asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements); + StoreLower(asciiVector, ref asciiBuffer, currentOffsetInElements); } // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment @@ -1564,7 +1577,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned."); asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); - asciiVector.GetLower().StoreUnsafe(ref asciiBuffer, currentOffsetInElements); + StoreLower(asciiVector, ref asciiBuffer, currentOffsetInElements); currentOffsetInElements += SizeOfVector128 / 2; goto Finish; From 64fca83160183f67bd351f03678600a07382ff2d Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Mon, 24 Apr 2023 17:20:36 +0200 Subject: [PATCH 2/8] Add [MethodImpl(MethodImplOptions.AggressiveInlining)] to NarrowUtf16ToAscii_Intrinsified --- .../System.Private.CoreLib/src/System/Text/Ascii.Utility.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 0007ae4251633..15d9d4bcb4629 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -1465,6 +1465,7 @@ private static Vector128 ExtractAsciiVector(Vector128 vectorFirst, } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, byte* pAsciiBuffer, nuint 
elementCount) { // This method contains logic optimized using vector instructions for both x64 and Arm64. From c397631c638c0130d7a5178ab919550e4c6c5638 Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Tue, 2 May 2023 11:17:29 +0200 Subject: [PATCH 3/8] rewrite StoreLower without Sse2.StoreScalar --- .../src/System/Text/Ascii.Utility.cs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 15d9d4bcb4629..ef409ced718a8 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -1400,11 +1400,9 @@ private static bool VectorContainsNonAsciiChar(Vector128 asciiVector) [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void StoreLower(Vector128 byteVector, ref byte bytePtr, nuint elementOffset) { - // GetLower().StoreUnsafe is quite inneficient on x86, below code make sure that the store is a single instruction instead of ~10 and stack spills - if (Sse2.IsSupported) - Sse2.StoreScalar((long*)((byte*)Unsafe.AsPointer(ref bytePtr) + elementOffset), byteVector.AsInt64()); - else - byteVector.GetLower().StoreUnsafe(ref bytePtr, elementOffset); + // Below code translates to a single write on x86 (for both 32 and 64 bit) + // - we use double instead of long so that the JIT writes directly to memory without intermediate (register or stack in case of 32 bit) + Unsafe.WriteUnaligned(ref Unsafe.Add(ref bytePtr, elementOffset), byteVector.AsDouble().ToScalar()); } [MethodImpl(MethodImplOptions.AggressiveInlining)] From 1103134ce0530d6b9013b5d31f8b3d5b10260f80 Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Sun, 7 May 2023 10:45:32 +0200 Subject: [PATCH 4/8] update comment --- .../System.Private.CoreLib/src/System/Text/Ascii.Utility.cs | 5 +++-- 1 file changed, 3 insertions(+), 2 
deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index ef409ced718a8..07bfdfc949397 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -1397,11 +1397,12 @@ private static bool VectorContainsNonAsciiChar(Vector128 asciiVector) /// /// Stores to lower 64bits of to memory destination of [] /// + /// + /// Uses double instead of long to get a single instruction instead of storing temps on general porpose register (or stack) + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void StoreLower(Vector128 byteVector, ref byte bytePtr, nuint elementOffset) { - // Below code translates to a single write on x86 (for both 32 and 64 bit) - // - we use double instead of long so that the JIT writes directly to memory without intermediate (register or stack in case of 32 bit) Unsafe.WriteUnaligned(ref Unsafe.Add(ref bytePtr, elementOffset), byteVector.AsDouble().ToScalar()); } From 70d71d0117a1e2ce5df6699c5a0fe8f9e11592ec Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Mon, 8 May 2023 08:30:43 +0200 Subject: [PATCH 5/8] move helper to Vector128 and call in case conversion --- .../src/System/Runtime/Intrinsics/Vector128.cs | 18 ++++++++++++++++++ .../src/System/Text/Ascii.CaseConversion.cs | 9 +++------ .../src/System/Text/Ascii.Utility.cs | 18 ++++-------------- 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index d1c7d5ee8a726..31aac455cd071 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -2718,6 +2718,24 @@ public 
static unsafe void StoreAligned(this Vector128 source, T* destinati public static unsafe void StoreAlignedNonTemporal(this Vector128 source, T* destination) where T : unmanaged => source.StoreAligned(destination); + /// + /// Stores to lower 64 bits of to memory destination of [] + /// + /// The type of the elements in the vector. + /// The vector that will be stored. + /// The destination to which will be added before the vector will be stored. + /// The element offset from from which the vector will be stored. + /// + /// Uses double instead of long to get a single instruction instead of storing temps on general purpose register (or stack) + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void StoreLowerUnsafe(this Vector128 source, ref T destination, nuint elementOffset = 0) + where T : struct + { + ref byte address = ref Unsafe.As(ref Unsafe.Add(ref destination, elementOffset)); + Unsafe.WriteUnaligned(ref address, source.AsDouble().ToScalar()); + } + /// Stores a vector at the given destination. /// The type of the elements in the vector. /// The vector that will be stored. diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.CaseConversion.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.CaseConversion.cs index 9fa47f66fbded..37d6a91b96c5b 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.CaseConversion.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.CaseConversion.cs @@ -548,12 +548,9 @@ private static unsafe void ChangeWidthAndWriteTo(Vector128 ve } else if (sizeof(TFrom) == 2 && sizeof(TTo) == 1) { - // narrowing operation required - // since we know data is all-ASCII, special-case SSE2 to avoid unneeded PAND in Narrow call - Vector128 narrow = (Sse2.IsSupported) - ? 
Sse2.PackUnsignedSaturate(vector.AsInt16(), vector.AsInt16()) - : Vector128.Narrow(vector.AsUInt16(), vector.AsUInt16()); - narrow.GetLower().StoreUnsafe(ref *(byte*)pDest, elementOffset); + // narrowing operation required, we know data is all-ASCII so use extract helper + Vector128 narrow = ExtractAsciiVector(vector.AsUInt16(), vector.AsUInt16()); + narrow.StoreLowerUnsafe(ref *(byte*)pDest, elementOffset); } else { diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 07bfdfc949397..7621c4a48f2c6 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -1394,17 +1394,7 @@ private static bool VectorContainsNonAsciiChar(Vector128 asciiVector) } } - /// - /// Stores to lower 64bits of to memory destination of [] - /// - /// - /// Uses double instead of long to get a single instruction instead of storing temps on general porpose register (or stack) - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe void StoreLower(Vector128 byteVector, ref byte bytePtr, nuint elementOffset) - { - Unsafe.WriteUnaligned(ref Unsafe.Add(ref bytePtr, elementOffset), byteVector.AsDouble().ToScalar()); - } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool VectorContainsNonAsciiChar(Vector128 utf16Vector) @@ -1497,7 +1487,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, ref byte asciiBuffer = ref *pAsciiBuffer; Vector128 asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); - StoreLower(asciiVector, ref asciiBuffer, 0); + asciiVector.StoreLowerUnsafe(ref asciiBuffer, 0); nuint currentOffsetInElements = SizeOfVector128 / 2; // we processed 8 elements so far // We're going to get the best performance when we have aligned writes, so we'll take the @@ -1524,7 +1514,7 @@ private 
static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination. asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); - StoreLower(asciiVector, ref asciiBuffer, currentOffsetInElements); + asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements); } // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment @@ -1577,7 +1567,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned."); asciiVector = ExtractAsciiVector(utf16VectorFirst, utf16VectorFirst); - StoreLower(asciiVector, ref asciiBuffer, currentOffsetInElements); + asciiVector.StoreLowerUnsafe(ref asciiBuffer, currentOffsetInElements); currentOffsetInElements += SizeOfVector128 / 2; goto Finish; From c560cf3d9bf46b21e84adeb9785901c69efbc215 Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Thu, 11 May 2023 14:51:02 +0200 Subject: [PATCH 6/8] Update src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs Co-authored-by: Adam Sitnik --- .../System.Private.CoreLib/src/System/Text/Ascii.Utility.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 7621c4a48f2c6..415cff28f737d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -1395,7 +1395,6 @@ private static bool VectorContainsNonAsciiChar(Vector128 asciiVector) } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool VectorContainsNonAsciiChar(Vector128 utf16Vector) { From 8411b06451f653125f699b232a3a089e9b3d2f77 Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Thu, 
11 May 2023 14:54:09 +0200 Subject: [PATCH 7/8] remove unused helpers --- .../src/System/Text/Ascii.CaseConversion.cs | 35 ------------------- .../src/System/Text/Ascii.Utility.cs | 1 - 2 files changed, 36 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.CaseConversion.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.CaseConversion.cs index 37d6a91b96c5b..67595171273c9 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.CaseConversion.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.CaseConversion.cs @@ -487,41 +487,6 @@ private static unsafe bool VectorContainsAnyNonAsciiData(Vector128 vector) return false; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe void Widen8To16AndAndWriteTo(Vector128 narrowVector, char* pDest, nuint destOffset) - { - if (Vector256.IsHardwareAccelerated) - { - Vector256 wide = Vector256.WidenLower(narrowVector.ToVector256Unsafe()); - wide.StoreUnsafe(ref *(ushort*)pDest, destOffset); - } - else - { - Vector128.WidenLower(narrowVector).StoreUnsafe(ref *(ushort*)pDest, destOffset); - Vector128.WidenUpper(narrowVector).StoreUnsafe(ref *(ushort*)pDest, destOffset + 8); - } - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe void Narrow16To8AndAndWriteTo(Vector128 wideVector, byte* pDest, nuint destOffset) - { - Vector128 narrow = Vector128.Narrow(wideVector, wideVector); - - if (Sse2.IsSupported) - { - // MOVQ is supported even on x86, unaligned accesses allowed - Sse2.StoreScalar((ulong*)(pDest + destOffset), narrow.AsUInt64()); - } - else if (Vector64.IsHardwareAccelerated) - { - narrow.GetLower().StoreUnsafe(ref *pDest, destOffset); - } - else - { - Unsafe.WriteUnaligned(pDest + destOffset, narrow.AsUInt64().ToScalar()); - } - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void ChangeWidthAndWriteTo(Vector128 vector, TTo* pDest, nuint elementOffset) where TFrom 
: unmanaged diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 415cff28f737d..c6b3e935d0163 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -1394,7 +1394,6 @@ private static bool VectorContainsNonAsciiChar(Vector128 asciiVector) } } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool VectorContainsNonAsciiChar(Vector128 utf16Vector) { From 3a99d1361deada08fa6462d1184273f9b29ccfab Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Thu, 11 May 2023 14:56:51 +0200 Subject: [PATCH 8/8] remove unused methods after merge --- .../src/System/Text/Ascii.CaseConversion.cs | 54 ------------------- 1 file changed, 54 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.CaseConversion.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.CaseConversion.cs index ae5eb519f0fa2..a9cdc30f570e0 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.CaseConversion.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.CaseConversion.cs @@ -463,41 +463,6 @@ private static unsafe nuint ChangeCase(TFrom* pSrc, TTo* pD return i; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe void Widen8To16AndAndWriteTo(Vector128 narrowVector, char* pDest, nuint destOffset) - { - if (Vector256.IsHardwareAccelerated) - { - Vector256 wide = Vector256.WidenLower(narrowVector.ToVector256Unsafe()); - wide.StoreUnsafe(ref *(ushort*)pDest, destOffset); - } - else - { - Vector128.WidenLower(narrowVector).StoreUnsafe(ref *(ushort*)pDest, destOffset); - Vector128.WidenUpper(narrowVector).StoreUnsafe(ref *(ushort*)pDest, destOffset + 8); - } - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe void Narrow16To8AndAndWriteTo(Vector128 
wideVector, byte* pDest, nuint destOffset) - { - Vector128 narrow = Vector128.Narrow(wideVector, wideVector); - - if (Sse2.IsSupported) - { - // MOVQ is supported even on x86, unaligned accesses allowed - Sse2.StoreScalar((ulong*)(pDest + destOffset), narrow.AsUInt64()); - } - else if (Vector64.IsHardwareAccelerated) - { - narrow.GetLower().StoreUnsafe(ref *pDest, destOffset); - } - else - { - Unsafe.WriteUnaligned(pDest + destOffset, narrow.AsUInt64().ToScalar()); - } - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void ChangeWidthAndWriteTo(Vector128 vector, TTo* pDest, nuint elementOffset) where TFrom : unmanaged @@ -553,25 +518,6 @@ private static unsafe Vector128 SignedLessThan(Vector128 left, Vector12 } } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe Vector128 NarrowOrWidenLowerVectorUnsigned(Vector128 vector) - where TFrom : unmanaged - where TTo : unmanaged - { - if (sizeof(TFrom) == 1 && sizeof(TTo) == 2) - { - return Vector128.WidenLower(vector.AsByte()).As(); - } - else if (sizeof(TFrom) == 2 && sizeof(TTo) == 1) - { - return Vector128.Narrow(vector.AsUInt16(), vector.AsUInt16()).As(); - } - else - { - throw new NotSupportedException(); - } - } - private struct ToUpperConversion { } private struct ToLowerConversion { } }