diff --git a/Microsoft.Toolkit.HighPerformance/Helpers/Internals/SpanHelper.Count.cs b/Microsoft.Toolkit.HighPerformance/Helpers/Internals/SpanHelper.Count.cs index 408b3073cf1..07277ab0fd6 100644 --- a/Microsoft.Toolkit.HighPerformance/Helpers/Internals/SpanHelper.Count.cs +++ b/Microsoft.Toolkit.HighPerformance/Helpers/Internals/SpanHelper.Count.cs @@ -79,9 +79,6 @@ public static nint Count(ref T r0, nint length, T value) /// Implements with a sequential search. /// [Pure] -#if NETCOREAPP3_1 - [MethodImpl(MethodImplOptions.AggressiveOptimization)] -#endif private static nint CountSequential(ref T r0, nint length, T value) where T : IEquatable { @@ -132,9 +129,6 @@ private static nint CountSequential(ref T r0, nint length, T value) /// Implements with a vectorized search. /// [Pure] -#if NETCOREAPP3_1 - [MethodImpl(MethodImplOptions.AggressiveOptimization)] -#endif private static nint CountSimd(ref T r0, nint length, T value) where T : unmanaged, IEquatable { @@ -161,6 +155,67 @@ private static nint CountSimd(ref T r0, nint length, T value) var partials = Vector.Zero; + // Unrolled vectorized loop, with 8 unrolled iterations. We only run this when the + // current type T is at least 2 bytes in size, otherwise the average chunk length + // would always be too small to be able to trigger the unrolled loop, and the overall + // performance would just be slightly worse due to the additional conditional branches. + if (typeof(T) != typeof(sbyte)) + { + while (chunkLength >= Vector.Count * 8) + { + ref T ri0 = ref Unsafe.Add(ref r0, offset + (Vector.Count * 0)); + var vi0 = Unsafe.As>(ref ri0); + var ve0 = Vector.Equals(vi0, vc); + + partials -= ve0; + + ref T ri1 = ref Unsafe.Add(ref r0, offset + (Vector.Count * 1)); + var vi1 = Unsafe.As>(ref ri1); + var ve1 = Vector.Equals(vi1, vc); + + partials -= ve1; + + ref T ri2 = ref Unsafe.Add(ref r0, offset + (Vector.Count * 2)); + var vi2 = Unsafe.As>(ref ri2); + var ve2 = Vector.Equals(vi2, vc); + + partials -= ve2; + + ref T ri3 = ref Unsafe.Add(ref r0, offset + (Vector.Count * 3)); + var vi3 = Unsafe.As>(ref ri3); + var ve3 = Vector.Equals(vi3, vc); + + partials -= ve3; + + ref T ri4 = ref Unsafe.Add(ref r0, offset + (Vector.Count * 4)); + var vi4 = Unsafe.As>(ref ri4); + var ve4 = Vector.Equals(vi4, vc); + + partials -= ve4; + + ref T ri5 = ref Unsafe.Add(ref r0, offset + (Vector.Count * 5)); + var vi5 = Unsafe.As>(ref ri5); + var ve5 = Vector.Equals(vi5, vc); + + partials -= ve5; + + ref T ri6 = ref Unsafe.Add(ref r0, offset + (Vector.Count * 6)); + var vi6 = Unsafe.As>(ref ri6); + var ve6 = Vector.Equals(vi6, vc); + + partials -= ve6; + + ref T ri7 = ref Unsafe.Add(ref r0, offset + (Vector.Count * 7)); + var vi7 = Unsafe.As>(ref ri7); + var ve7 = Vector.Equals(vi7, vc); + + partials -= ve7; + + chunkLength -= Vector.Count * 8; + offset += Vector.Count * 8; + } + } + while (chunkLength >= Vector.Count) { ref T ri = ref Unsafe.Add(ref r0, offset); @@ -242,28 +297,22 @@ private static nint CountSimd(ref T r0, nint length, T value) private static unsafe nint GetUpperBound() where T : unmanaged { - if (typeof(T) == typeof(byte) || - typeof(T) == typeof(sbyte) || - typeof(T) == typeof(bool)) + if (typeof(T) == typeof(sbyte)) { return sbyte.MaxValue; } - if (typeof(T) == typeof(char) || - typeof(T) == typeof(ushort) || - typeof(T) == typeof(short)) + if (typeof(T) == typeof(short)) { return short.MaxValue; } - if (typeof(T) == typeof(int) || - typeof(T) == typeof(uint)) + if (typeof(T) == typeof(int)) { return int.MaxValue; } - if (typeof(T) == typeof(long) || - typeof(T) == typeof(ulong)) + if (typeof(T) == typeof(long)) { if (sizeof(nint) == sizeof(int)) { diff --git a/Microsoft.Toolkit.HighPerformance/Helpers/Internals/SpanHelper.Hash.cs b/Microsoft.Toolkit.HighPerformance/Helpers/Internals/SpanHelper.Hash.cs index a7920734683..4287aca9306 100644 --- a/Microsoft.Toolkit.HighPerformance/Helpers/Internals/SpanHelper.Hash.cs +++ b/Microsoft.Toolkit.HighPerformance/Helpers/Internals/SpanHelper.Hash.cs @@ -21,9 +21,6 @@ internal static partial class SpanHelper /// The number of items to hash. /// The Djb2 value for the input sequence of items. [Pure] -#if NETCOREAPP3_1 - [MethodImpl(MethodImplOptions.AggressiveOptimization)] -#endif public static int GetDjb2HashCode(ref T r0, nint length) where T : notnull { @@ -87,9 +84,6 @@ public static int GetDjb2HashCode(ref T r0, nint length) /// faster than , as it can parallelize much of the workload. /// [Pure] -#if NETCOREAPP3_1 - [MethodImpl(MethodImplOptions.AggressiveOptimization)] -#endif public static unsafe int GetDjb2LikeByteHash(ref byte r0, nint length) { int hash = 5381;