dotnet · EgorBo · Oct 4, 2021 · Jul 17, 2021 · Jul 17, 2021 · Jul 17, 2021
diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp
@@ -20388,8 +20388,33 @@ GenTree* Compiler::impInlineFetchArg(unsigned lclNum, InlArgInfo* inlArgInfo, In
             // TODO-1stClassStructs: We currently do not reuse an existing lclVar
             // if it is a struct, because it requires some additional handling.
 
-            if (!varTypeIsStruct(lclTyp) && !argInfo.argHasSideEff && !argInfo.argHasGlobRef &&
-                !argInfo.argHasCallerLocalRef)
+            bool substitute = false;
+            switch (argNode->OperGet())
+            {
+#ifdef FEATURE_HW_INTRINSICS
+                case GT_HWINTRINSIC:
+                {
+                    // Enable for all parameterless (=invariant) hw intrinsics such as
+                    // Vector128<>.Zero and Vector256<>.AllBitSets. We might consider
+                    // doing that for Vector.Create(cns) as well.
+                    if ((argNode->gtGetOp1() == nullptr) && (argNode->gtGetOp2() == nullptr))
+                    {
+                        substitute = true;
+                    }
+                    break;
+                }
+#endif
+
+                // TODO: Enable substitution for CORINFO_HELP_TYPEHANDLE_TO_RUNTIMETYPE (typeof(T))
+                // but in order to benefit from that, we need to move various "typeof + IsValueType"
+                // optimizations from importer to morph.
+
+                default:
+                    break;
+            }
+
+            if (substitute || (!varTypeIsStruct(lclTyp) && !argInfo.argHasSideEff && !argInfo.argHasGlobRef &&
+                               !argInfo.argHasCallerLocalRef))
             {
                 /* Get a *LARGE* LCL_VAR node */
                 op1 = gtNewLclLNode(tmpNum, genActualType(lclTyp) DEBUGARG(lclNum));

diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
@@ -14242,6 +14242,47 @@ GenTree* Compiler::fgMorphSmpOpOptional(GenTreeOp* tree)
             }
             break;
 
+#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
+        case GT_HWINTRINSIC:
+        {
+            GenTreeHWIntrinsic* hw = tree->AsHWIntrinsic();
+            switch (hw->gtHWIntrinsicId)
+            {
+                case NI_SSE_Xor:
+                case NI_SSE2_Xor:
+                case NI_AVX_Xor:
+                case NI_AVX2_Xor:
+                {
+                    // Transform XOR(X, 0) and XOR(0, X) to X for vectors
+                    GenTree* op1 = hw->gtGetOp1();
+                    GenTree* op2 = hw->gtGetOp2();
+                    if (!gtIsActiveCSE_Candidate(tree))
+                    {
+                        if (op1->IsIntegralConstVector(0) && !gtIsActiveCSE_Candidate(op1))
+                        {
+                            DEBUG_DESTROY_NODE(tree);
+                            DEBUG_DESTROY_NODE(op1);
+                            INDEBUG(op2->gtDebugFlags |= GTF_DEBUG_NODE_MORPHED);
+                            return op2;
+                        }
+                        if (op2->IsIntegralConstVector(0) && !gtIsActiveCSE_Candidate(op2))
+                        {
+                            DEBUG_DESTROY_NODE(tree);
+                            DEBUG_DESTROY_NODE(op2);
+                            INDEBUG(op1->gtDebugFlags |= GTF_DEBUG_NODE_MORPHED);
+                            return op1;
+                        }
+                    }
+                    break;
+                }
+
+                default:
+                    break;
+            }
+            break;
+        }
+#endif // defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
+
         default:
             break;
     }

diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128_1.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128_1.cs
@@ -123,6 +123,12 @@ public bool Equals(Vector128<T> other)
                     Vector128<double> result = Sse2.CompareEqual(this.AsDouble(), other.AsDouble());
                     return Sse2.MoveMask(result) == 0b11; // We have one bit per element
                 }
+                else if (Sse41.IsSupported)
+                {
+                    // xor + testz is slightly better for integer types
+                    Vector128<byte> xored = Sse2.Xor(this.AsByte(), other.AsByte());
+                    return Sse41.TestZ(xored, xored);
+                }
                 else
                 {
                     // Unlike float/double, there are no special values to consider

diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256_1.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256_1.cs
@@ -134,8 +134,9 @@ public bool Equals(Vector256<T> other)
                 // bytes are exactly the same.
 
                 Debug.Assert((typeof(T) != typeof(float)) && (typeof(T) != typeof(double)));
-                Vector256<byte> result = Avx2.CompareEqual(this.AsByte(), other.AsByte());
-                return Avx2.MoveMask(result) == unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111)); // We have one bit per element
+
+                Vector256<byte> xored = Avx2.Xor(this.AsByte(), other.AsByte());
+                return Avx.TestZ(xored, xored);
             }
 
             return SoftwareFallback(in this, other);