From 466afe6ef6b420e8466e838c02e3a749e7bb3a6e Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sun, 20 Feb 2022 14:48:54 +0300 Subject: [PATCH 1/9] Optimize vec == Vector.Zero for arm64 --- src/coreclr/jit/lowerarmarch.cpp | 87 ++++++++++++------- .../General/HwiOp/CompareVectorWithZero.cs | 56 ++++++++++++ .../HwiOp/CompareVectorWithZero.csproj | 9 ++ 3 files changed, 122 insertions(+), 30 deletions(-) create mode 100644 src/tests/JIT/HardwareIntrinsics/General/HwiOp/CompareVectorWithZero.cs create mode 100644 src/tests/JIT/HardwareIntrinsics/General/HwiOp/CompareVectorWithZero.csproj diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 433046e98a01a..1522c73ddd240 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -898,51 +898,78 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp) } } - GenTree* cmp = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, cmpIntrinsic, simdBaseJitType, simdSize); - BlockRange().InsertBefore(node, cmp); - LowerNode(cmp); + GenTree* val = nullptr; + NamedIntrinsic op2ni = op2->AsHWIntrinsic()->GetHWIntrinsicId(); - if ((simdBaseType == TYP_FLOAT) && (simdSize == 12)) + // Optimize comparison against a all-zero integer vector via UMAX, basically: + // + // bool eq = v == Vector128.Zero + // + // to: + // + // bool eq = AdvSimd.Arm64.MaxAcross(v.AsByte()).ToScalar() == 0; + // + // Assume morph made get_Zero rhs if it was lhs + if (!varTypeIsFloating(simdBaseType) && (op2ni == NI_Vector64_get_Zero) || (op2ni == NI_Vector128_get_Zero)) + { + GenTree* cmp = + comp->gtNewSimdHWIntrinsicNode(simdType, op1, NI_AdvSimd_Arm64_MaxAcross, CORINFO_TYPE_UBYTE, simdSize); + BlockRange().InsertBefore(node, cmp); + LowerNode(cmp); + BlockRange().Remove(op2); + + val = comp->gtNewSimdHWIntrinsicNode(TYP_INT, cmp, NI_Vector128_ToScalar, CORINFO_TYPE_UBYTE, simdSize); + BlockRange().InsertAfter(cmp, val); + LowerNode(val); + } + else { - // For TYP_SIMD12 we don't want the upper bits to participate in the comparison. So, we will insert all ones - // into those bits of the result, "as if" the upper bits are equal. Then if all lower bits are equal, we get the - // expected all-ones result, and will get the expected 0's only where there are non-matching bits. + GenTree* cmp = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, cmpIntrinsic, simdBaseJitType, simdSize); + BlockRange().InsertBefore(node, cmp); + LowerNode(cmp); - GenTree* idxCns = comp->gtNewIconNode(3, TYP_INT); - BlockRange().InsertAfter(cmp, idxCns); + if ((simdBaseType == TYP_FLOAT) && (simdSize == 12)) + { + // For TYP_SIMD12 we don't want the upper bits to participate in the comparison. So, we will insert all ones + // into those bits of the result, "as if" the upper bits are equal. Then if all lower bits are equal, we get + // the + // expected all-ones result, and will get the expected 0's only where there are non-matching bits. - GenTree* insCns = comp->gtNewIconNode(-1, TYP_INT); - BlockRange().InsertAfter(idxCns, insCns); + GenTree* idxCns = comp->gtNewIconNode(3, TYP_INT); + BlockRange().InsertAfter(cmp, idxCns); - GenTree* tmp = comp->gtNewSimdHWIntrinsicNode(simdType, cmp, idxCns, insCns, NI_AdvSimd_Insert, - CORINFO_TYPE_INT, simdSize); - BlockRange().InsertAfter(insCns, tmp); - LowerNode(tmp); + GenTree* insCns = comp->gtNewIconNode(-1, TYP_INT); + BlockRange().InsertAfter(idxCns, insCns); - cmp = tmp; - } + GenTree* tmp = comp->gtNewSimdHWIntrinsicNode(simdType, cmp, idxCns, insCns, NI_AdvSimd_Insert, + CORINFO_TYPE_INT, simdSize); + BlockRange().InsertAfter(insCns, tmp); + LowerNode(tmp); - GenTree* msk = - comp->gtNewSimdHWIntrinsicNode(simdType, cmp, NI_AdvSimd_Arm64_MinAcross, CORINFO_TYPE_UBYTE, simdSize); - BlockRange().InsertAfter(cmp, msk); - LowerNode(msk); + cmp = tmp; + } + + GenTree* msk = + comp->gtNewSimdHWIntrinsicNode(simdType, cmp, NI_AdvSimd_Arm64_MinAcross, CORINFO_TYPE_UBYTE, simdSize); + BlockRange().InsertAfter(cmp, msk); + LowerNode(msk); - GenTree* zroCns = comp->gtNewIconNode(0, TYP_INT); - BlockRange().InsertAfter(msk, zroCns); + GenTree* zroCns = comp->gtNewIconNode(0, TYP_INT); + BlockRange().InsertAfter(msk, zroCns); - GenTree* val = - comp->gtNewSimdHWIntrinsicNode(TYP_UBYTE, msk, zroCns, NI_AdvSimd_Extract, CORINFO_TYPE_UBYTE, simdSize); - BlockRange().InsertAfter(zroCns, val); - LowerNode(val); + val = comp->gtNewSimdHWIntrinsicNode(TYP_UBYTE, msk, zroCns, NI_AdvSimd_Extract, CORINFO_TYPE_UBYTE, simdSize); + BlockRange().InsertAfter(zroCns, val); + LowerNode(val); + } - zroCns = comp->gtNewIconNode(0, TYP_INT); - BlockRange().InsertAfter(val, zroCns); + GenTree* cmpZroCns = comp->gtNewIconNode(0, TYP_INT); + BlockRange().InsertAfter(val, cmpZroCns); node->ChangeOper(cmpOp); node->gtType = TYP_INT; node->AsOp()->gtOp1 = val; - node->AsOp()->gtOp2 = zroCns; + node->AsOp()->gtOp2 = cmpZroCns; // The CompareEqual will set (condition is true) or clear (condition is false) all bits of the respective element // The MinAcross then ensures we get either all bits set (all conditions are true) or clear (any condition is false) diff --git a/src/tests/JIT/HardwareIntrinsics/General/HwiOp/CompareVectorWithZero.cs b/src/tests/JIT/HardwareIntrinsics/General/HwiOp/CompareVectorWithZero.cs new file mode 100644 index 0000000000000..88df982739393 --- /dev/null +++ b/src/tests/JIT/HardwareIntrinsics/General/HwiOp/CompareVectorWithZero.cs @@ -0,0 +1,56 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; + +public class CompareVectorWithZero +{ + public static int Main() + { + Test(Vector128.Create(0)); + Test(Vector128.Create(0.0f)); + Test(Vector128.Create(-0.0f)); + Test(Vector128.Create(0.0)); + Test(Vector128.Create(-0.0)); + Test(Vector128.Create(-10)); + Test(Vector128.Create(10)); + Test(Vector128.Create((sbyte)-10)); + Test(Vector128.Create((ushort)10)); + Test(Vector64.Create(0)); + Test(Vector64.Create(0.0f)); + Test(Vector64.Create(-0.0f)); + Test(Vector64.Create(0.0)); + Test(Vector64.Create(-0.0)); + Test(Vector64.Create(-10)); + Test(Vector64.Create(10)); + Test(Vector64.Create((sbyte)-10)); + Test(Vector64.Create((ushort)10)); + Test(Vector128.Create(0, 0, 0, 0, 0, 0, 0, 1)); + Test(Vector128.Create(0, 0, 0, 0, 0, 0, 0, -1)); + Test(Vector64.Create(0, 0, 0, 0, 0, 0, 0, 1)); + Test(Vector64.Create(0, 0, 0, 0, 0, 0, 0, -1)); + return 100; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static T ToVar(T t) => t; + + [MethodImpl(MethodImplOptions.NoInlining)] + public static void AssertTrue(bool expr) + { + if (!expr) + throw new InvalidOperationException(); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static void Test(Vector128 v) where T : unmanaged => + AssertTrue(v == Vector128.Zero == + v == Vector128.Create((byte)ToVar(0))); + + [MethodImpl(MethodImplOptions.NoInlining)] + public static void Test(Vector64 v) where T : unmanaged => + AssertTrue(v == Vector64.Zero == + v == Vector64.Create((byte)ToVar(0))); +} diff --git a/src/tests/JIT/HardwareIntrinsics/General/HwiOp/CompareVectorWithZero.csproj b/src/tests/JIT/HardwareIntrinsics/General/HwiOp/CompareVectorWithZero.csproj new file mode 100644 index 0000000000000..6946bed81bfd5 --- /dev/null +++ b/src/tests/JIT/HardwareIntrinsics/General/HwiOp/CompareVectorWithZero.csproj @@ -0,0 +1,9 @@ + + + Exe + True + + + + + From 31392181d71c7780092ae4f8fe5dc7f0a5dc7f41 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sun, 20 Feb 2022 16:37:56 +0300 Subject: [PATCH 2/9] Clean up --- src/coreclr/jit/lowerarmarch.cpp | 123 ++++++++++-------- .../General/HwiOp/CompareVectorWithZero.cs | 18 ++- 2 files changed, 80 insertions(+), 61 deletions(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 1522c73ddd240..5823d63dafb21 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -868,6 +868,42 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp) GenTree* op1 = node->Op(1); GenTree* op2 = node->Op(2); + // Optimize comparison against a all-zero integer vector via UMAX, basically: + // + // bool eq = v == Vector128.Zero + // + // to: + // + // bool eq = AdvSimd.Arm64.MaxAcross(v.AsByte()).ToScalar() == 0; + // + // Assume morph made get_Zero rhs if it was lhs + NamedIntrinsic op2ni = op2->OperIsHWIntrinsic() ? op2->AsHWIntrinsic()->GetHWIntrinsicId() : NI_Illegal; + if (!varTypeIsFloating(simdBaseType) && ((op2ni == NI_Vector64_get_Zero) || (op2ni == NI_Vector128_get_Zero))) + { + GenTree* cmp = + comp->gtNewSimdHWIntrinsicNode(simdType, op1, NI_AdvSimd_Arm64_MaxAcross, CORINFO_TYPE_BYTE, simdSize); + BlockRange().InsertBefore(node, cmp); + LowerNode(cmp); + BlockRange().Remove(op2); + + GenTree* val = comp->gtNewSimdHWIntrinsicNode(TYP_INT, cmp, NI_Vector128_ToScalar, CORINFO_TYPE_BYTE, simdSize); + BlockRange().InsertAfter(cmp, val); + LowerNode(val); + + GenTree* cmpZroCns = comp->gtNewIconNode(0, TYP_INT); + BlockRange().InsertAfter(val, cmpZroCns); + + node->ChangeOper(cmpOp); + node->gtType = TYP_INT; + node->AsOp()->gtOp1 = val; + node->AsOp()->gtOp2 = cmpZroCns; + LowerNodeCC(node, (cmpOp == GT_EQ) ? GenCondition::EQ : GenCondition::NE); + node->gtType = TYP_VOID; + node->ClearUnusedValue(); + LowerNode(node); + return; + } + NamedIntrinsic cmpIntrinsic; switch (simdBaseType) @@ -898,78 +934,51 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp) } } - GenTree* val = nullptr; - NamedIntrinsic op2ni = op2->AsHWIntrinsic()->GetHWIntrinsicId(); - - // Optimize comparison against a all-zero integer vector via UMAX, basically: - // - // bool eq = v == Vector128.Zero - // - // to: - // - // bool eq = AdvSimd.Arm64.MaxAcross(v.AsByte()).ToScalar() == 0; - // - // Assume morph made get_Zero rhs if it was lhs - if (!varTypeIsFloating(simdBaseType) && (op2ni == NI_Vector64_get_Zero) || (op2ni == NI_Vector128_get_Zero)) - { - GenTree* cmp = - comp->gtNewSimdHWIntrinsicNode(simdType, op1, NI_AdvSimd_Arm64_MaxAcross, CORINFO_TYPE_UBYTE, simdSize); - BlockRange().InsertBefore(node, cmp); - LowerNode(cmp); - BlockRange().Remove(op2); + GenTree* cmp = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, cmpIntrinsic, simdBaseJitType, simdSize); + BlockRange().InsertBefore(node, cmp); + LowerNode(cmp); - val = comp->gtNewSimdHWIntrinsicNode(TYP_INT, cmp, NI_Vector128_ToScalar, CORINFO_TYPE_UBYTE, simdSize); - BlockRange().InsertAfter(cmp, val); - LowerNode(val); - } - else + if ((simdBaseType == TYP_FLOAT) && (simdSize == 12)) { - GenTree* cmp = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, cmpIntrinsic, simdBaseJitType, simdSize); - BlockRange().InsertBefore(node, cmp); - LowerNode(cmp); + // For TYP_SIMD12 we don't want the upper bits to participate in the comparison. So, we will insert all ones + // into those bits of the result, "as if" the upper bits are equal. Then if all lower bits are equal, we get the + // expected all-ones result, and will get the expected 0's only where there are non-matching bits. - if ((simdBaseType == TYP_FLOAT) && (simdSize == 12)) - { - // For TYP_SIMD12 we don't want the upper bits to participate in the comparison. So, we will insert all ones - // into those bits of the result, "as if" the upper bits are equal. Then if all lower bits are equal, we get - // the - // expected all-ones result, and will get the expected 0's only where there are non-matching bits. - - GenTree* idxCns = comp->gtNewIconNode(3, TYP_INT); - BlockRange().InsertAfter(cmp, idxCns); + GenTree* idxCns = comp->gtNewIconNode(3, TYP_INT); + BlockRange().InsertAfter(cmp, idxCns); - GenTree* insCns = comp->gtNewIconNode(-1, TYP_INT); - BlockRange().InsertAfter(idxCns, insCns); + GenTree* insCns = comp->gtNewIconNode(-1, TYP_INT); + BlockRange().InsertAfter(idxCns, insCns); - GenTree* tmp = comp->gtNewSimdHWIntrinsicNode(simdType, cmp, idxCns, insCns, NI_AdvSimd_Insert, - CORINFO_TYPE_INT, simdSize); - BlockRange().InsertAfter(insCns, tmp); - LowerNode(tmp); + GenTree* tmp = comp->gtNewSimdHWIntrinsicNode(simdType, cmp, idxCns, insCns, NI_AdvSimd_Insert, + CORINFO_TYPE_INT, simdSize); + BlockRange().InsertAfter(insCns, tmp); + LowerNode(tmp); - cmp = tmp; - } + cmp = tmp; + } - GenTree* msk = - comp->gtNewSimdHWIntrinsicNode(simdType, cmp, NI_AdvSimd_Arm64_MinAcross, CORINFO_TYPE_UBYTE, simdSize); - BlockRange().InsertAfter(cmp, msk); - LowerNode(msk); + GenTree* msk = + comp->gtNewSimdHWIntrinsicNode(simdType, cmp, NI_AdvSimd_Arm64_MinAcross, CORINFO_TYPE_UBYTE, simdSize); + BlockRange().InsertAfter(cmp, msk); + LowerNode(msk); - GenTree* zroCns = comp->gtNewIconNode(0, TYP_INT); - BlockRange().InsertAfter(msk, zroCns); + GenTree* zroCns = comp->gtNewIconNode(0, TYP_INT); + BlockRange().InsertAfter(msk, zroCns); - val = comp->gtNewSimdHWIntrinsicNode(TYP_UBYTE, msk, zroCns, NI_AdvSimd_Extract, CORINFO_TYPE_UBYTE, simdSize); - BlockRange().InsertAfter(zroCns, val); - LowerNode(val); - } + GenTree* val = + comp->gtNewSimdHWIntrinsicNode(TYP_UBYTE, msk, zroCns, NI_AdvSimd_Extract, CORINFO_TYPE_UBYTE, simdSize); + BlockRange().InsertAfter(zroCns, val); + LowerNode(val); - GenTree* cmpZroCns = comp->gtNewIconNode(0, TYP_INT); - BlockRange().InsertAfter(val, cmpZroCns); + zroCns = comp->gtNewIconNode(0, TYP_INT); + BlockRange().InsertAfter(val, zroCns); node->ChangeOper(cmpOp); node->gtType = TYP_INT; node->AsOp()->gtOp1 = val; - node->AsOp()->gtOp2 = cmpZroCns; + node->AsOp()->gtOp2 = zroCns; // The CompareEqual will set (condition is true) or clear (condition is false) all bits of the respective element // The MinAcross then ensures we get either all bits set (all conditions are true) or clear (any condition is false) diff --git a/src/tests/JIT/HardwareIntrinsics/General/HwiOp/CompareVectorWithZero.cs b/src/tests/JIT/HardwareIntrinsics/General/HwiOp/CompareVectorWithZero.cs index 88df982739393..b26183b004190 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/HwiOp/CompareVectorWithZero.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/HwiOp/CompareVectorWithZero.cs @@ -31,6 +31,16 @@ public static int Main() Test(Vector128.Create(0, 0, 0, 0, 0, 0, 0, -1)); Test(Vector64.Create(0, 0, 0, 0, 0, 0, 0, 1)); Test(Vector64.Create(0, 0, 0, 0, 0, 0, 0, -1)); + + Test(Vector128.Create(0, 0, 0, 0, 0, 0, 1, 0)); + Test(Vector128.Create(0, 0, 0, 0, 0, 0, 1, 0)); + Test(Vector64.Create(0, 0, 0, 0, 0, 0, -1, 0)); + Test(Vector64.Create(0, 0, 0, 0, 0, 0, -1, 0)); + + Test(Vector128.Create(0, 0, 0, 1, 0, 0, 0, 1)); + Test(Vector128.Create(0, 0, 0, -1, 0, 0, 0, -1)); + Test(Vector64.Create(0, 0, 0, 1, 0, 0, 0, 1)); + Test(Vector64.Create(0, 0, 0, -1, 0, 0, 0, -1)); return 100; } @@ -46,11 +56,11 @@ public static void AssertTrue(bool expr) [MethodImpl(MethodImplOptions.NoInlining)] public static void Test(Vector128 v) where T : unmanaged => - AssertTrue(v == Vector128.Zero == - v == Vector128.Create((byte)ToVar(0))); + AssertTrue((v == Vector128.Zero) == + (v == Vector128.Create(ToVar(default(T))))); [MethodImpl(MethodImplOptions.NoInlining)] public static void Test(Vector64 v) where T : unmanaged => - AssertTrue(v == Vector64.Zero == - v == Vector64.Create((byte)ToVar(0))); + AssertTrue((v == Vector64.Zero) == + (v == Vector64.Create(ToVar(default(T))))); } From a4143c4a91c529e826015590defc86ad27142830 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sun, 20 Feb 2022 16:54:24 +0300 Subject: [PATCH 3/9] Update lowerarmarch.cpp --- src/coreclr/jit/lowerarmarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 5823d63dafb21..d1a0c5a9305c7 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -868,7 +868,7 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp) GenTree* op1 = node->Op(1); GenTree* op2 = node->Op(2); - // Optimize comparison against a all-zero integer vector via UMAX, basically: + // Optimize comparison against Vector64/128<>.Zero via UMAX: // // bool eq = v == Vector128.Zero // From f9d45e0bd8d7bbef88abc962e748da7ef3fb9408 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sun, 20 Feb 2022 18:39:30 +0300 Subject: [PATCH 4/9] Update lowerarmarch.cpp --- src/coreclr/jit/lowerarmarch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index d1a0c5a9305c7..4c070b6a151a1 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -881,12 +881,12 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp) if (!varTypeIsFloating(simdBaseType) && ((op2ni == NI_Vector64_get_Zero) || (op2ni == NI_Vector128_get_Zero))) { GenTree* cmp = - comp->gtNewSimdHWIntrinsicNode(simdType, op1, NI_AdvSimd_Arm64_MaxAcross, CORINFO_TYPE_BYTE, simdSize); + comp->gtNewSimdHWIntrinsicNode(simdType, op1, NI_AdvSimd_Arm64_MaxAcross, CORINFO_TYPE_USHORT, simdSize); BlockRange().InsertBefore(node, cmp); LowerNode(cmp); BlockRange().Remove(op2); - GenTree* val = comp->gtNewSimdHWIntrinsicNode(TYP_INT, cmp, NI_Vector128_ToScalar, CORINFO_TYPE_BYTE, simdSize); + GenTree* val = comp->gtNewSimdHWIntrinsicNode(TYP_INT, cmp, NI_Vector128_ToScalar, CORINFO_TYPE_USHORT, simdSize); BlockRange().InsertAfter(cmp, val); LowerNode(val); From 51f727f51c9e70c4b95b251230f252d4bdd63d42 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sun, 20 Feb 2022 18:53:40 +0300 Subject: [PATCH 5/9] Update lowerarmarch.cpp --- src/coreclr/jit/lowerarmarch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 4c070b6a151a1..37a31efdf629f 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -881,12 +881,12 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp) if (!varTypeIsFloating(simdBaseType) && ((op2ni == NI_Vector64_get_Zero) || (op2ni == NI_Vector128_get_Zero))) { GenTree* cmp = - comp->gtNewSimdHWIntrinsicNode(simdType, op1, NI_AdvSimd_Arm64_MaxAcross, CORINFO_TYPE_USHORT, simdSize); + comp->gtNewSimdHWIntrinsicNode(simdType, op1, NI_AdvSimd_Arm64_MaxAcross, CORINFO_TYPE_UBYTE, simdSize); BlockRange().InsertBefore(node, cmp); LowerNode(cmp); BlockRange().Remove(op2); - GenTree* val = comp->gtNewSimdHWIntrinsicNode(TYP_INT, cmp, NI_Vector128_ToScalar, CORINFO_TYPE_USHORT, simdSize); + GenTree* val = comp->gtNewSimdHWIntrinsicNode(TYP_INT, cmp, NI_Vector128_ToScalar, CORINFO_TYPE_UINT, simdSize); BlockRange().InsertAfter(cmp, val); LowerNode(val); From 70e18b79b91e3a0dea978aa561e714c7158c13c9 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Mon, 21 Feb 2022 15:21:32 +0300 Subject: [PATCH 6/9] Apply Zoltan's patch --- src/mono/mono/mini/simd-intrinsics.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index c602de2e572bc..1f69c38966c86 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -773,6 +773,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi } case SN_Create: { MonoType *etype = get_vector_t_elem_type (fsig->ret); + if (!MONO_TYPE_IS_INTRINSICS_VECTOR_PRIMITIVE (etype)) + return NULL; if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype)) return emit_simd_ins (cfg, klass, type_to_expand_op (etype), args [0]->dreg, -1); else if (is_create_from_half_vectors_overload (fsig)) From c1c78315ba1da61a06512477ec54f3c5840122fd Mon Sep 17 00:00:00 2001 From: EgorBo Date: Tue, 22 Feb 2022 00:55:43 +0300 Subject: [PATCH 7/9] Address feedback --- src/coreclr/jit/lowerarmarch.cpp | 21 +++++++++++++----- .../General/HwiOp/CompareVectorWithZero.cs | 22 +++++++++++++++++++ 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 37a31efdf629f..7513371c078ed 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -876,15 +876,26 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp) // // bool eq = AdvSimd.Arm64.MaxAcross(v.AsByte()).ToScalar() == 0; // - // Assume morph made get_Zero rhs if it was lhs - NamedIntrinsic op2ni = op2->OperIsHWIntrinsic() ? op2->AsHWIntrinsic()->GetHWIntrinsicId() : NI_Illegal; - if (!varTypeIsFloating(simdBaseType) && ((op2ni == NI_Vector64_get_Zero) || (op2ni == NI_Vector128_get_Zero))) + GenTree* op = nullptr; + GenTree* opZero = nullptr; + if (op1->IsVectorZero()) + { + op = op2; + opZero = op1; + } + else if (op2->IsVectorZero()) + { + op = op1; + opZero = op2; + } + + if (!varTypeIsFloating(simdBaseType) && (op != nullptr)) { GenTree* cmp = - comp->gtNewSimdHWIntrinsicNode(simdType, op1, NI_AdvSimd_Arm64_MaxAcross, CORINFO_TYPE_UBYTE, simdSize); + comp->gtNewSimdHWIntrinsicNode(simdType, op, NI_AdvSimd_Arm64_MaxAcross, CORINFO_TYPE_UBYTE, simdSize); BlockRange().InsertBefore(node, cmp); LowerNode(cmp); - BlockRange().Remove(op2); + BlockRange().Remove(opZero); GenTree* val = comp->gtNewSimdHWIntrinsicNode(TYP_INT, cmp, NI_Vector128_ToScalar, CORINFO_TYPE_UINT, simdSize); BlockRange().InsertAfter(cmp, val); diff --git a/src/tests/JIT/HardwareIntrinsics/General/HwiOp/CompareVectorWithZero.cs b/src/tests/JIT/HardwareIntrinsics/General/HwiOp/CompareVectorWithZero.cs index b26183b004190..8024922d59a36 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/HwiOp/CompareVectorWithZero.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/HwiOp/CompareVectorWithZero.cs @@ -14,6 +14,13 @@ public static int Main() Test(Vector128.Create(-0.0f)); Test(Vector128.Create(0.0)); Test(Vector128.Create(-0.0)); + + TestReversed(Vector128.Create(0)); + TestReversed(Vector128.Create(0.0f)); + TestReversed(Vector128.Create(-0.0f)); + TestReversed(Vector128.Create(0.0)); + TestReversed(Vector128.Create(-0.0)); + Test(Vector128.Create(-10)); Test(Vector128.Create(10)); Test(Vector128.Create((sbyte)-10)); @@ -41,6 +48,11 @@ public static int Main() Test(Vector128.Create(0, 0, 0, -1, 0, 0, 0, -1)); Test(Vector64.Create(0, 0, 0, 1, 0, 0, 0, 1)); Test(Vector64.Create(0, 0, 0, -1, 0, 0, 0, -1)); + + TestReversed(Vector128.Create(0, 0, 0, 1, 0, 0, 0, 1)); + TestReversed(Vector128.Create(0, 0, 0, -1, 0, 0, 0, -1)); + TestReversed(Vector64.Create(0, 0, 0, 1, 0, 0, 0, 1)); + TestReversed(Vector64.Create(0, 0, 0, -1, 0, 0, 0, -1)); return 100; } @@ -63,4 +75,14 @@ public static void Test(Vector128 v) where T : unmanaged => public static void Test(Vector64 v) where T : unmanaged => AssertTrue((v == Vector64.Zero) == (v == Vector64.Create(ToVar(default(T))))); + + [MethodImpl(MethodImplOptions.NoInlining)] + public static void TestReversed(Vector128 v) where T : unmanaged => + AssertTrue((Vector128.Zero == v) == + (v == Vector128.Create(ToVar(default(T))))); + + [MethodImpl(MethodImplOptions.NoInlining)] + public static void TestReversed(Vector64 v) where T : unmanaged => + AssertTrue((Vector64.Zero == v) == + (v == Vector64.Create(ToVar(default(T))))); } From 59609c447faa659675d0852a406399cc5ecf986e Mon Sep 17 00:00:00 2001 From: EgorBo Date: Tue, 22 Feb 2022 23:52:08 +0300 Subject: [PATCH 8/9] Address feedback --- src/coreclr/jit/lowerarmarch.cpp | 11 ++++++----- src/mono/mono/mini/simd-intrinsics.c | 2 -- src/tests/issues.targets | 4 ++++ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 7513371c078ed..fe4f01917a6be 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -874,7 +874,7 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp) // // to: // - // bool eq = AdvSimd.Arm64.MaxAcross(v.AsByte()).ToScalar() == 0; + // bool eq = AdvSimd.Arm64.MaxAcross(v.AsUInt16()).ToScalar() == 0; // GenTree* op = nullptr; GenTree* opZero = nullptr; @@ -891,8 +891,9 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp) if (!varTypeIsFloating(simdBaseType) && (op != nullptr)) { + // Use USHORT variant as it has better latency/throughput on some CPUs than UBYTE GenTree* cmp = - comp->gtNewSimdHWIntrinsicNode(simdType, op, NI_AdvSimd_Arm64_MaxAcross, CORINFO_TYPE_UBYTE, simdSize); + comp->gtNewSimdHWIntrinsicNode(simdType, op, NI_AdvSimd_Arm64_MaxAcross, CORINFO_TYPE_USHORT, simdSize); BlockRange().InsertBefore(node, cmp); LowerNode(cmp); BlockRange().Remove(opZero); @@ -901,13 +902,13 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp) BlockRange().InsertAfter(cmp, val); LowerNode(val); - GenTree* cmpZroCns = comp->gtNewIconNode(0, TYP_INT); - BlockRange().InsertAfter(val, cmpZroCns); + GenTree* cmpZeroCns = comp->gtNewIconNode(0, TYP_INT); + BlockRange().InsertAfter(val, cmpZeroCns); node->ChangeOper(cmpOp); node->gtType = TYP_INT; node->AsOp()->gtOp1 = val; - node->AsOp()->gtOp2 = cmpZroCns; + node->AsOp()->gtOp2 = cmpZeroCns; LowerNodeCC(node, (cmpOp == GT_EQ) ? GenCondition::EQ : GenCondition::NE); node->gtType = TYP_VOID; node->ClearUnusedValue(); diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 8b32557bd126e..4bc0199739d6d 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -776,8 +776,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi } case SN_Create: { MonoType *etype = get_vector_t_elem_type (fsig->ret); - if (!MONO_TYPE_IS_INTRINSICS_VECTOR_PRIMITIVE (etype)) - return NULL; if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype)) return emit_simd_ins (cfg, klass, type_to_expand_op (etype), args [0]->dreg, -1); else if (is_create_from_half_vectors_overload (fsig)) diff --git a/src/tests/issues.targets b/src/tests/issues.targets index 81424d1de65eb..c5e27cc88c43a 100644 --- a/src/tests/issues.targets +++ b/src/tests/issues.targets @@ -1449,6 +1449,10 @@ Crashes during LLVM AOT compilation. + + https://github.com/dotnet/runtime/pull/65632#issuecomment-1046294324 + + Doesn't pass after LLVM AOT compilation. From bc9220b69c403a5b6eccb3aa6cbc8d993745a92b Mon Sep 17 00:00:00 2001 From: EgorBo Date: Wed, 23 Feb 2022 00:27:22 +0300 Subject: [PATCH 9/9] use UINT for V128 --- src/coreclr/jit/lowerarmarch.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index fe4f01917a6be..e7d74f303da0b 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -891,9 +891,9 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp) if (!varTypeIsFloating(simdBaseType) && (op != nullptr)) { - // Use USHORT variant as it has better latency/throughput on some CPUs than UBYTE - GenTree* cmp = - comp->gtNewSimdHWIntrinsicNode(simdType, op, NI_AdvSimd_Arm64_MaxAcross, CORINFO_TYPE_USHORT, simdSize); + // Use USHORT for V64 and UINT for V128 due to better latency/TP on some CPUs + CorInfoType maxType = (simdSize == 8) ? CORINFO_TYPE_USHORT : CORINFO_TYPE_UINT; + GenTree* cmp = comp->gtNewSimdHWIntrinsicNode(simdType, op, NI_AdvSimd_Arm64_MaxAcross, maxType, simdSize); BlockRange().InsertBefore(node, cmp); LowerNode(cmp); BlockRange().Remove(opZero);