From 87c86aa6b93aea3d1603c1759a17fb6b5ba6e814 Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Thu, 29 Aug 2024 10:42:44 +0800 Subject: [PATCH 1/4] [X86,SimplifyCFG] Support hoisting load/store with conditional faulting (Part I) (#96878) This is simplifycfg part of https://github.com/llvm/llvm-project/pull/95515 In this PR, we support hoisting load/store with conditional faulting in `SimplifyCFGOpt::speculativelyExecuteBB` to eliminate conditional branches. This is for cases like ``` void test (int a, int *b) { if (a) *b = a; } ``` In the following patches, we will support the hoist in `SimplifyCFGOpt::hoistCommonCodeFromSuccessors`. That is for cases like ``` void test (int a, int *c, int *d) { if (a) *c = a; else *d = a; } ``` --- .../Transforms/Utils/SimplifyCFGOptions.h | 5 + llvm/lib/Passes/PassBuilder.cpp | 2 + llvm/lib/Passes/PassBuilderPipelines.cpp | 8 +- .../lib/Transforms/Scalar/SimplifyCFGPass.cpp | 10 + llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 164 ++++- llvm/test/Other/new-pm-print-pipeline.ll | 4 +- .../X86/masked-memory-ops-with-cf.ll | 40 + .../X86/hoist-loads-stores-with-cf.ll | 694 ++++++++++++++++++ 8 files changed, 913 insertions(+), 14 deletions(-) create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops-with-cf.ll create mode 100644 llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h index 2ea9d64f03cb64..ee3cc950cdb503 100644 --- a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h +++ b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h @@ -27,6 +27,7 @@ struct SimplifyCFGOptions { bool ConvertSwitchToLookupTable = false; bool NeedCanonicalLoop = true; bool HoistCommonInsts = false; + bool HoistLoadsStoresWithCondFaulting = false; bool SinkCommonInsts = false; bool SimplifyCondBranch = true; bool SpeculateBlocks = true; @@ -59,6 +60,10 @@ struct SimplifyCFGOptions { HoistCommonInsts = B; return *this; } + SimplifyCFGOptions &hoistLoadsStoresWithCondFaulting(bool B) { + HoistLoadsStoresWithCondFaulting = B; + return *this; + } SimplifyCFGOptions &sinkCommonInsts(bool B) { SinkCommonInsts = B; return *this; diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 17eed97fd950c9..63173c4abb8191 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -848,6 +848,8 @@ Expected parseSimplifyCFGOptions(StringRef Params) { Result.needCanonicalLoops(Enable); } else if (ParamName == "hoist-common-insts") { Result.hoistCommonInsts(Enable); + } else if (ParamName == "hoist-loads-stores-with-cond-faulting") { + Result.hoistLoadsStoresWithCondFaulting(Enable); } else if (ParamName == "sink-common-insts") { Result.sinkCommonInsts(Enable); } else if (ParamName == "speculate-unpredictables") { diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 1184123c7710f0..9c3d49cabbd38c 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1534,9 +1534,11 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // LoopSink (and other loop passes since the last simplifyCFG) might have // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. 
- OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions() - .convertSwitchRangeToICmp(true) - .speculateUnpredictables(true))); + OptimizePM.addPass( + SimplifyCFGPass(SimplifyCFGOptions() + .convertSwitchRangeToICmp(true) + .speculateUnpredictables(true) + .hoistLoadsStoresWithCondFaulting(true))); // Add the core optimizing pipeline. MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM), diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 11de37f7a7c109..daa82a8c368e2b 100644 --- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -73,6 +73,11 @@ static cl::opt UserHoistCommonInsts( "hoist-common-insts", cl::Hidden, cl::init(false), cl::desc("hoist common instructions (default = false)")); +static cl::opt UserHoistLoadsStoresWithCondFaulting( + "hoist-loads-stores-with-cond-faulting", cl::Hidden, cl::init(false), + cl::desc("Hoist loads/stores if the target supports conditional faulting " + "(default = false)")); + static cl::opt UserSinkCommonInsts( "sink-common-insts", cl::Hidden, cl::init(false), cl::desc("Sink common instructions (default = false)")); @@ -326,6 +331,9 @@ static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) { Options.NeedCanonicalLoop = UserKeepLoops; if (UserHoistCommonInsts.getNumOccurrences()) Options.HoistCommonInsts = UserHoistCommonInsts; + if (UserHoistLoadsStoresWithCondFaulting.getNumOccurrences()) + Options.HoistLoadsStoresWithCondFaulting = + UserHoistLoadsStoresWithCondFaulting; if (UserSinkCommonInsts.getNumOccurrences()) Options.SinkCommonInsts = UserSinkCommonInsts; if (UserSpeculateUnpredictables.getNumOccurrences()) @@ -354,6 +362,8 @@ void SimplifyCFGPass::printPipeline( << "switch-to-lookup;"; OS << (Options.NeedCanonicalLoop ? "" : "no-") << "keep-loops;"; OS << (Options.HoistCommonInsts ? "" : "no-") << "hoist-common-insts;"; + OS << (Options.HoistLoadsStoresWithCondFaulting ? "" : "no-") + << "hoist-loads-stores-with-cond-faulting;"; OS << (Options.SinkCommonInsts ? "" : "no-") << "sink-common-insts;"; OS << (Options.SpeculateBlocks ? "" : "no-") << "speculate-blocks;"; OS << (Options.SimplifyCondBranch ? "" : "no-") << "simplify-cond-branch;"; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 92e2d189aff6ff..15de40c7b09962 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -117,6 +117,18 @@ static cl::opt HoistCommon("simplifycfg-hoist-common", cl::Hidden, cl::init(true), cl::desc("Hoist common instructions up to the parent block")); +static cl::opt HoistLoadsStoresWithCondFaulting( + "simplifycfg-hoist-loads-stores-with-cond-faulting", cl::Hidden, + cl::init(true), + cl::desc("Hoist loads/stores if the target supports " + "conditional faulting")); + +static cl::opt HoistLoadsStoresWithCondFaultingThreshold( + "hoist-loads-stores-with-cond-faulting-threshold", cl::Hidden, cl::init(6), + cl::desc("Control the maximal conditonal load/store that we are willing " + "to speculatively execute to eliminate conditional branch " + "(default = 6)")); + static cl::opt HoistCommonSkipLimit("simplifycfg-hoist-common-skip-limit", cl::Hidden, cl::init(20), @@ -2986,6 +2998,25 @@ static bool isProfitableToSpeculate(const BranchInst *BI, bool Invert, return BIEndProb < Likely; } +static bool isSafeCheapLoadStore(const Instruction *I, + const TargetTransformInfo &TTI) { + // Not handle volatile or atomic. 
+ if (auto *L = dyn_cast(I)) { + if (!L->isSimple()) + return false; + } else if (auto *S = dyn_cast(I)) { + if (!S->isSimple()) + return false; + } else + return false; + + // llvm.masked.load/store use i32 for alignment while load/store use i64. + // That's why we have the alignment limitation. + // FIXME: Update the prototype of the intrinsics? + return TTI.hasConditionalLoadStoreForType(getLoadStoreType(I)) && + getLoadStoreAlignment(I) < Value::MaximumAlignment; +} + /// Speculate a conditional basic block flattening the CFG. /// /// Note that this is a very risky transform currently. Speculating @@ -3060,6 +3091,9 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI, SmallVector SpeculatedDbgIntrinsics; unsigned SpeculatedInstructions = 0; + bool HoistLoadsStores = HoistLoadsStoresWithCondFaulting && + Options.HoistLoadsStoresWithCondFaulting; + SmallVector SpeculatedConditionalLoadsStores; Value *SpeculatedStoreValue = nullptr; StoreInst *SpeculatedStore = nullptr; EphemeralValueTracker EphTracker; @@ -3088,22 +3122,33 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI, // Only speculatively execute a single instruction (not counting the // terminator) for now. - ++SpeculatedInstructions; + bool IsSafeCheapLoadStore = HoistLoadsStores && + isSafeCheapLoadStore(&I, TTI) && + SpeculatedConditionalLoadsStores.size() < + HoistLoadsStoresWithCondFaultingThreshold; + // Not count load/store into cost if target supports conditional faulting + // b/c it's cheap to speculate it. + if (IsSafeCheapLoadStore) + SpeculatedConditionalLoadsStores.push_back(&I); + else + ++SpeculatedInstructions; + if (SpeculatedInstructions > 1) return false; // Don't hoist the instruction if it's unsafe or expensive. - if (!isSafeToSpeculativelyExecute(&I) && - !(HoistCondStores && (SpeculatedStoreValue = isSafeToSpeculateStore( - &I, BB, ThenBB, EndBB)))) + if (!IsSafeCheapLoadStore && !isSafeToSpeculativelyExecute(&I) && + !(HoistCondStores && !SpeculatedStoreValue && + (SpeculatedStoreValue = + isSafeToSpeculateStore(&I, BB, ThenBB, EndBB)))) return false; - if (!SpeculatedStoreValue && + if (!IsSafeCheapLoadStore && !SpeculatedStoreValue && computeSpeculationCost(&I, TTI) > PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic) return false; // Store the store speculation candidate. - if (SpeculatedStoreValue) + if (!SpeculatedStore && SpeculatedStoreValue) SpeculatedStore = cast(&I); // Do not hoist the instruction if any of its operands are defined but not @@ -3130,11 +3175,11 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI, // Check that we can insert the selects and that it's not too expensive to do // so. - bool Convert = SpeculatedStore != nullptr; + bool Convert = + SpeculatedStore != nullptr || !SpeculatedConditionalLoadsStores.empty(); InstructionCost Cost = 0; Convert |= validateAndCostRequiredSelects(BB, ThenBB, EndBB, - SpeculatedInstructions, - Cost, TTI); + SpeculatedInstructions, Cost, TTI); if (!Convert || Cost > Budget) return false; @@ -3222,6 +3267,107 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI, BB->splice(BI->getIterator(), ThenBB, ThenBB->begin(), std::prev(ThenBB->end())); + // If the target supports conditional faulting, + // we look for the following pattern: + // \code + // BB: + // ... + // %cond = icmp ult %x, %y + // br i1 %cond, label %TrueBB, label %FalseBB + // FalseBB: + // store i32 1, ptr %q, align 4 + // ... 
+ // TrueBB: + // %maskedloadstore = load i32, ptr %b, align 4 + // store i32 %maskedloadstore, ptr %p, align 4 + // ... + // \endcode + // + // and transform it into: + // + // \code + // BB: + // ... + // %cond = icmp ult %x, %y + // %maskedloadstore = cload i32, ptr %b, %cond + // cstore i32 %maskedloadstore, ptr %p, %cond + // cstore i32 1, ptr %q, ~%cond + // br i1 %cond, label %TrueBB, label %FalseBB + // FalseBB: + // ... + // TrueBB: + // ... + // \endcode + // + // where cload/cstore are represented by llvm.masked.load/store intrinsics, + // e.g. + // + // \code + // %vcond = bitcast i1 %cond to <1 x i1> + // %v0 = call <1 x i32> @llvm.masked.load.v1i32.p0 + // (ptr %b, i32 4, <1 x i1> %vcond, <1 x i32> poison) + // %maskedloadstore = bitcast <1 x i32> %v0 to i32 + // call void @llvm.masked.store.v1i32.p0 + // (<1 x i32> %v0, ptr %p, i32 4, <1 x i1> %vcond) + // %cond.not = xor i1 %cond, true + // %vcond.not = bitcast i1 %cond.not to <1 x i> + // call void @llvm.masked.store.v1i32.p0 + // (<1 x i32> , ptr %q, i32 4, <1x i1> %vcond.not) + // \endcode + // + // So we need to turn hoisted load/store into cload/cstore. + auto &Context = BI->getParent()->getContext(); + auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1); + auto *Cond = BI->getOperand(0); + Value *Mask = nullptr; + // Construct the condition if needed. + if (!SpeculatedConditionalLoadsStores.empty()) { + IRBuilder<> Builder(SpeculatedConditionalLoadsStores.back()); + Mask = Builder.CreateBitCast( + Invert ? Builder.CreateXor(Cond, ConstantInt::getTrue(Context)) : Cond, + VCondTy); + } + for (auto *I : SpeculatedConditionalLoadsStores) { + IRBuilder<> Builder(I); + // We currently assume conditional faulting load/store is supported for + // scalar types only when creating new instructions. This can be easily + // extended for vector types in the future. + assert(!getLoadStoreType(I)->isVectorTy() && "not implemented"); + auto *Op0 = I->getOperand(0); + Instruction *MaskedLoadStore = nullptr; + if (auto *LI = dyn_cast(I)) { + // Handle Load. + auto *Ty = I->getType(); + MaskedLoadStore = Builder.CreateMaskedLoad(FixedVectorType::get(Ty, 1), + Op0, LI->getAlign(), Mask); + I->replaceAllUsesWith(Builder.CreateBitCast(MaskedLoadStore, Ty)); + } else { + // Handle Store. + auto *StoredVal = + Builder.CreateBitCast(Op0, FixedVectorType::get(Op0->getType(), 1)); + MaskedLoadStore = Builder.CreateMaskedStore( + StoredVal, I->getOperand(1), cast(I)->getAlign(), Mask); + } + // For non-debug metadata, only !annotation, !range, !nonnull and !align are + // kept when hoisting (see Instruction::dropUBImplyingAttrsAndMetadata). + // + // !nonnull, !align : Not support pointer type, no need to keep. + // !range: Load type is changed from scalar to vector, but the metadata on + // vector specifies a per-element range, so the semantics stay the + // same. Keep it. + // !annotation: Not impact semantics. Keep it. + I->dropUBImplyingAttrsAndUnknownMetadata( + {LLVMContext::MD_range, LLVMContext::MD_annotation}); + // FIXME: DIAssignID is not supported for masked store yet. + // (Verifier::visitDIAssignIDMetadata) + at::deleteAssignmentMarkers(I); + I->eraseMetadataIf([](unsigned MDKind, MDNode *Node) { + return Node->getMetadataID() == Metadata::DIAssignIDKind; + }); + MaskedLoadStore->copyMetadata(*I); + I->eraseFromParent(); + } + // Insert selects and rewrite the PHI operands. 
IRBuilder Builder(BI); for (PHINode &PN : EndBB->phis()) { diff --git a/llvm/test/Other/new-pm-print-pipeline.ll b/llvm/test/Other/new-pm-print-pipeline.ll index f2e80814f347ad..12f88d60d66cec 100644 --- a/llvm/test/Other/new-pm-print-pipeline.ll +++ b/llvm/test/Other/new-pm-print-pipeline.ll @@ -49,8 +49,8 @@ ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(print,print)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-17 ; CHECK-17: function(print,print) -; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(simplifycfg,simplifycfg)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-18 -; CHECK-18: function(simplifycfg,simplifycfg) +; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(simplifycfg,simplifycfg)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-18 +; CHECK-18: function(simplifycfg,simplifycfg) ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(loop-vectorize,loop-vectorize)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-19 ; CHECK-19: function(loop-vectorize,loop-vectorize) diff --git a/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops-with-cf.ll b/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops-with-cf.ll new file mode 100644 index 00000000000000..405a26de3d6afa --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops-with-cf.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -O1 -S | FileCheck %s + +;; Test masked.load/store.v1* is generated in simplifycfg and not falls back to branch+load/store in following passes. +define void @basic(i1 %cond, ptr %b, ptr %p, ptr %q) { +; CHECK-LABEL: @basic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[B:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> poison) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <1 x i16> +; CHECK-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> [[TMP7]], ptr [[B]], i32 2, <1 x i1> [[TMP0]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32 [[TMP4]] to <1 x i32> +; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP8]], ptr [[P]], i32 4, <1 x i1> [[TMP0]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP6]] to <1 x i64> +; CHECK-NEXT: call void @llvm.masked.store.v1i64.p0(<1 x i64> [[TMP9]], ptr [[Q]], i32 8, <1 x i1> [[TMP0]]) +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %if.true, label %if.false + +if.false: + br label %if.end + +if.true: + %pv = load i16, ptr %p, align 2 + %qv = load i32, ptr %q, align 4 + %bv = load i64, ptr %b, align 8 + store i16 %pv, ptr %b, align 2 + store i32 %qv, ptr %p, align 4 + store i64 %bv, ptr %q, align 8 + br label %if.false + +if.end: + ret void +} diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll 
b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll new file mode 100644 index 00000000000000..047ca717da8009 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll @@ -0,0 +1,694 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes='simplifycfg' -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s + +;; Basic case: check masked.load/store is generated for i16/i32/i64. +define void @basic(i1 %cond, ptr %b, ptr %p, ptr %q) { +; CHECK-LABEL: @basic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[B:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> poison) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <1 x i16> +; CHECK-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> [[TMP7]], ptr [[B]], i32 2, <1 x i1> [[TMP0]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32 [[TMP4]] to <1 x i32> +; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP8]], ptr [[P]], i32 4, <1 x i1> [[TMP0]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP6]] to <1 x i64> +; CHECK-NEXT: call void @llvm.masked.store.v1i64.p0(<1 x i64> [[TMP9]], ptr [[Q]], i32 8, <1 x i1> [[TMP0]]) +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %if.true, label %if.false + +if.false: + br label %if.end + +if.true: + %0 = load i16, ptr %p, align 2 + %1 = load i32, ptr %q, align 4 + %2 = load i64, ptr %b, align 8 + store i16 %0, ptr %b, align 2 + store i32 %1, ptr %p, align 4 + store i64 %2, ptr %q, align 8 + br label %if.false + +if.end: + ret void +} + +;; Successor 1 branches to successor 0. +define void @succ1to0(ptr %p, ptr %q, i32 %a) { +; CHECK-LABEL: @succ1to0( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[TOBOOL]], true +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to <1 x i32> +; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP4]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP1]]) +; CHECK-NEXT: ret void +; +entry: + %tobool = icmp ne i32 %a, 0 + br i1 %tobool, label %if.end, label %if.then + +if.end: + ret void + +if.then: + %0 = load i32, ptr %q + store i32 %0, ptr %p + br label %if.end +} + +;; Successor 1 branches to successor 0 and there is a phi node. 
+define i32 @succ1to0_phi(ptr %p) { +; CHECK-LABEL: @succ1to0_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND:%.*]] = icmp eq ptr [[P:%.*]], null +; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[COND]], true +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[P]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32 +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[COND]], i32 0, i32 [[TMP3]] +; CHECK-NEXT: ret i32 [[SPEC_SELECT]] +; +entry: + %cond = icmp eq ptr %p, null + br i1 %cond, label %if.true, label %if.false + +if.false: + %0 = load i32, ptr %p + br label %if.true + +if.true: + %res = phi i32 [ %0, %if.false ], [ 0, %entry ] + ret i32 %res +} + +;; Successor 0 branches to successor 1. +define void @succ0to1(i32 %a, ptr %b, ptr %p, ptr %q) { +; CHECK-LABEL: @succ0to1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32> +; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]]) +; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + %cond = icmp eq i32 %a, 0 + br i1 %cond, label %if.true, label %if.false + +if.false: + store i32 1, ptr %q + br label %if.end + +if.true: + %0 = load i32, ptr %b + store i32 %0, ptr %p + br label %if.false + +if.end: + ret void +} + +;; Load after store can be hoisted. +define i64 @load_after_store(i32 %a, ptr %b, ptr %p) { +; CHECK-LABEL: @load_after_store( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1> +; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> , ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16 +; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[TMP2]] to i64 +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[COND]], i64 [[ZEXT]], i64 0 +; CHECK-NEXT: ret i64 [[SPEC_SELECT]] +; +entry: + %cond = icmp eq i32 %a, 0 + br i1 %cond, label %if.true, label %if.end + +if.true: + store i32 1, ptr %b + %0 = load i16, ptr %p + %zext = zext i16 %0 to i64 + ret i64 %zext + +if.end: + ret i64 0 +} + +;; Speculatable memory read doesn't prevent the hoist. 
+define void @load_skip_speculatable_memory_read(i32 %a, ptr %p, ptr %q) { +; CHECK-LABEL: @load_skip_speculatable_memory_read( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[READ:%.*]] = call i32 @read_memory_only() +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[READ]] to <1 x i32> +; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP1]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]]) +; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + %cond = icmp eq i32 %a, 0 + br i1 %cond, label %if.true, label %if.false + +if.false: + store i32 1, ptr %q + br label %if.end + +if.true: + %read = call i32 @read_memory_only() + store i32 %read, ptr %p + br label %if.false + +if.end: + ret void +} + +;; Source of the load can be a GEP. +define i32 @load_from_gep(ptr %p) { +; CHECK-LABEL: @load_from_gep( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND:%.*]] = icmp eq ptr [[P:%.*]], null +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 +; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[COND]], true +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[ARRAYIDX]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32 +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[COND]], i32 0, i32 [[TMP3]] +; CHECK-NEXT: ret i32 [[SPEC_SELECT]] +; +entry: + %cond = icmp eq ptr %p, null + br i1 %cond, label %if.true, label %if.false + +if.false: + %arrayidx = getelementptr inbounds i8, ptr %p, i64 16 + %0 = load i32, ptr %arrayidx + br label %if.true + +if.true: + %res = phi i32 [ %0, %if.false ], [ 0, %entry ] + ret i32 %res +} + +;; Metadata range/annotation are kept. 
+define void @nondebug_metadata(i1 %cond, ptr %p, ptr %q) { +; CHECK-LABEL: @nondebug_metadata( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison), !range [[RNG5:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison), !annotation [[META6:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <1 x i16> +; CHECK-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> [[TMP5]], ptr [[Q]], i32 4, <1 x i1> [[TMP0]]), !annotation [[META6]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[TMP4]] to <1 x i32> +; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP6]], ptr [[P]], i32 2, <1 x i1> [[TMP0]]) +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %if.true, label %if.false + +if.false: + ret void + +if.true: + %0 = load i16, ptr %p, align 2, !range !{i16 0, i16 10} + %1 = load i32, ptr %q, align 4, !annotation !11 + store i16 %0, ptr %q, align 4, !annotation !11 + store i32 %1, ptr %p, align 2 + br label %if.false +} + +define i16 @debug_metadata_diassign(i1 %cond, i16 %a, ptr %p) { +; CHECK-LABEL: @debug_metadata_diassign( +; CHECK-NEXT: bb0: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1> +; CHECK-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> , ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]]) +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[COND]], i16 3, i16 2 +; CHECK-NEXT: ret i16 [[SPEC_SELECT]] +; +bb0: + br i1 %cond, label %if.true, label %if.false + +if.true: + store i16 7, ptr %p, align 4, !DIAssignID !9 + br label %if.false + +if.false: + %ret = phi i16 [ 2, %bb0 ], [ 3, %if.true ] + call void @llvm.dbg.assign(metadata i16 %ret, metadata !8, metadata !DIExpression(), metadata !9, metadata ptr %p, metadata !DIExpression()), !dbg !7 + ret i16 %ret +} + +;; Not crash when working with opt controlled by simplifycfg-hoist-cond-stores. +define i32 @hoist_cond_stores(i1 %cond, ptr %p) { +; CHECK-LABEL: @hoist_cond_stores( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i1 false, ptr [[P:%.*]], align 2 +; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = select i1 [[COND:%.*]], i1 false, i1 false +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1> +; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr [[P]], i32 8, <1 x i1> [[TMP0]]) +; CHECK-NEXT: store i1 [[SPEC_STORE_SELECT]], ptr [[P]], align 2 +; CHECK-NEXT: ret i32 0 +; +entry: + store i1 false, ptr %p, align 2 + br i1 %cond, label %if.true, label %if.false + +if.true: ; preds = %entry + store i32 0, ptr %p, align 8 + store i1 false, ptr %p, align 2 + br label %if.false + +if.false: ; preds = %if.true, %entry + ret i32 0 +} + +;; Both of successor 0 and successor 1 have a single predecessor. +;; TODO: Support transform for this case. 
+define void @single_predecessor(ptr %p, ptr %q, i32 %a) { +; CHECK-LABEL: @single_predecessor( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: if.end: +; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Q]], align 4 +; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: br label [[COMMON_RET]] +; +entry: + %tobool = icmp ne i32 %a, 0 + br i1 %tobool, label %if.end, label %if.then + +if.end: + store i32 1, ptr %q + ret void + +if.then: + %0 = load i32, ptr %q + store i32 %0, ptr %p + ret void +} + +;; Hoist 6 stores. +define void @threshold_6(i1 %cond, ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, ptr %p6) { +; CHECK-LABEL: @threshold_6( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1> +; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> , ptr [[P1:%.*]], i32 4, <1 x i1> [[TMP0]]) +; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> , ptr [[P2:%.*]], i32 4, <1 x i1> [[TMP0]]) +; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> , ptr [[P3:%.*]], i32 4, <1 x i1> [[TMP0]]) +; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> , ptr [[P4:%.*]], i32 4, <1 x i1> [[TMP0]]) +; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> , ptr [[P5:%.*]], i32 4, <1 x i1> [[TMP0]]) +; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> , ptr [[P6:%.*]], i32 4, <1 x i1> [[TMP0]]) +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %if.true, label %if.false + +if.true: + store i32 1, ptr %p1, align 4 + store i32 2, ptr %p2, align 4 + store i32 3, ptr %p3, align 4 + store i32 4, ptr %p4, align 4 + store i32 5, ptr %p5, align 4 + store i32 6, ptr %p6, align 4 + br label %if.false + +if.false: + ret void +} + +;; Not hoist 7 stores. +define void @threshold_7(i1 %cond, ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, ptr %p6, ptr %p7) { +; CHECK-LABEL: @threshold_7( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] +; CHECK: if.true: +; CHECK-NEXT: store i32 1, ptr [[P1:%.*]], align 4 +; CHECK-NEXT: store i32 2, ptr [[P2:%.*]], align 4 +; CHECK-NEXT: store i32 3, ptr [[P3:%.*]], align 4 +; CHECK-NEXT: store i32 4, ptr [[P4:%.*]], align 4 +; CHECK-NEXT: store i32 5, ptr [[P5:%.*]], align 4 +; CHECK-NEXT: store i32 6, ptr [[P6:%.*]], align 4 +; CHECK-NEXT: store i32 7, ptr [[P7:%.*]], align 4 +; CHECK-NEXT: br label [[IF_FALSE]] +; CHECK: if.false: +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %if.true, label %if.false + +if.true: + store i32 1, ptr %p1, align 4 + store i32 2, ptr %p2, align 4 + store i32 3, ptr %p3, align 4 + store i32 4, ptr %p4, align 4 + store i32 5, ptr %p5, align 4 + store i32 6, ptr %p6, align 4 + store i32 7, ptr %p7, align 4 + br label %if.false + +if.false: + ret void +} + +;; Not do hoist if the cost of instructions to be hoisted is expensive. 
+define i32 @not_cheap_to_hoist(i32 %a, ptr %b, ptr %p, ptr %q, i32 %v0, i32 %v1, i32 %v2, i1 %cc) { +; CHECK-LABEL: @not_cheap_to_hoist( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ [[VVVV:%.*]], [[IF_FALSE]] ], [ 0, [[IF_TRUE]] ] +; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] +; CHECK: if.false: +; CHECK-NEXT: store i64 1, ptr [[P:%.*]], align 8 +; CHECK-NEXT: store i16 2, ptr [[Q:%.*]], align 2 +; CHECK-NEXT: [[V:%.*]] = udiv i32 [[A]], 12345 +; CHECK-NEXT: [[VV:%.*]] = mul i32 [[V]], [[V0:%.*]] +; CHECK-NEXT: [[VVV:%.*]] = mul i32 [[VV]], [[V1:%.*]] +; CHECK-NEXT: [[VVVV]] = select i1 [[CC:%.*]], i32 [[V2:%.*]], i32 [[VVV]] +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: if.true: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4 +; CHECK-NEXT: store i32 [[TMP0]], ptr [[P]], align 4 +; CHECK-NEXT: br label [[COMMON_RET]] +; +entry: + %cond = icmp eq i32 %a, 0 + br i1 %cond, label %if.true, label %if.false + +if.false: + store i64 1, ptr %p + store i16 2, ptr %q + + %v = udiv i32 %a, 12345 + %vv = mul i32 %v, %v0 + %vvv = mul i32 %vv, %v1 + %vvvv = select i1 %cc, i32 %v2, i32 %vvv + ret i32 %vvvv + +if.true: + %0 = load i32, ptr %b + store i32 %0, ptr %p + br label %if.end + +if.end: + ret i32 0 +} + +;; Not hoist if there is more than 1 prodecessor. +define void @not_single_predecessor(ptr %p, ptr %q, i32 %a) { +; CHECK-LABEL: @not_single_predecessor( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.end: +; CHECK-NEXT: br label [[IF_THEN]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: br label [[IF_END]] +; +entry: + %tobool = icmp ne i32 %a, 0 + br i1 %tobool, label %if.end, label %if.then + +if.end: + br label %if.then + +if.then: + %1 = load i32, ptr %q + store i32 %1, ptr %p + br label %if.end +} + +;; Not hoist b/c i8 is not supported by conditional faulting. +define void @not_supported_type(i8 %a, ptr %b, ptr %p, ptr %q) { +; CHECK-LABEL: @not_supported_type( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[A:%.*]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] +; CHECK: if.false: +; CHECK-NEXT: store i8 1, ptr [[Q:%.*]], align 1 +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.true: +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[B:%.*]], align 1 +; CHECK-NEXT: store i8 [[TMP0]], ptr [[P:%.*]], align 1 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret void +; +entry: + %cond = icmp eq i8 %a, 0 + br i1 %cond, label %if.true, label %if.false + +if.false: + store i8 1, ptr %q + br label %if.end + +if.true: + %0 = load i8, ptr %b + store i8 %0, ptr %p + br label %if.end + +if.end: + ret void +} + +;; Not hoist if the terminator is not br. 
+define void @not_br_terminator(i32 %a, ptr %b, ptr %p, ptr %q) { +; CHECK-LABEL: @not_br_terminator( +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i32 [[A:%.*]], label [[IF_END:%.*]] [ +; CHECK-NEXT: i32 1, label [[IF_FALSE:%.*]] +; CHECK-NEXT: i32 2, label [[IF_TRUE:%.*]] +; CHECK-NEXT: ] +; CHECK: if.false: +; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.true: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4 +; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: br label [[IF_FALSE]] +; CHECK: if.end: +; CHECK-NEXT: ret void +; +entry: + switch i32 %a, label %if.end [ + i32 1, label %if.false + i32 2, label %if.true + ] + +if.false: + store i32 1, ptr %q, align 4 + br label %if.end + +if.true: + %0 = load i32, ptr %b, align 4 + store i32 %0, ptr %p, align 4 + br label %if.false + +if.end: + ret void +} + +;; Not hoist if the instruction to be hoist is atomic. +define void @not_atomic(i1 %cond, ptr %p) { +; CHECK-LABEL: @not_atomic( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] +; CHECK: if.false: +; CHECK-NEXT: store atomic i32 1, ptr [[P:%.*]] seq_cst, align 4 +; CHECK-NEXT: br label [[IF_TRUE]] +; CHECK: if.true: +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %if.true, label %if.false + +if.false: + store atomic i32 1, ptr %p seq_cst, align 4 + br label %if.true + +if.true: + ret void +} + +;; Not hoist if the instruction to be hoist is volatile. +define void @not_volatile(i1 %cond, ptr %p) { +; CHECK-LABEL: @not_volatile( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] +; CHECK: if.false: +; CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: br label [[IF_TRUE]] +; CHECK: if.true: +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %if.true, label %if.false + +if.false: + %0 = load volatile i32, ptr %p, align 4 + br label %if.true + +if.true: + ret void +} + +;; Not hoist if there is an instruction that has side effect in the same bb. +define void @not_hoistable_sideeffect(i1 %cond, ptr %p, ptr %q) { +; CHECK-LABEL: @not_hoistable_sideeffect( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] +; CHECK: if.false: +; CHECK-NEXT: [[RMW:%.*]] = atomicrmw xchg ptr [[Q:%.*]], double 4.000000e+00 seq_cst, align 8 +; CHECK-NEXT: store i32 1, ptr [[P:%.*]], align 4 +; CHECK-NEXT: br label [[IF_TRUE]] +; CHECK: if.true: +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %if.true, label %if.false + +if.false: + %rmw= atomicrmw xchg ptr %q, double 4.0 seq_cst + store i32 1, ptr %p, align 4 + br label %if.true + +if.true: + ret void +} + +;; Not hoist if the branch is predictable and the `then` BB is not likely to execute. 
+define void @not_likely_to_execute(ptr %p, ptr %q, i32 %a) { +; CHECK-LABEL: @not_likely_to_execute( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]], !prof [[PROF7:![0-9]+]] +; CHECK: if.end: +; CHECK-NEXT: ret void +; CHECK: if.then: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: br label [[IF_END]] +; +entry: + %tobool = icmp ne i32 %a, 0 + br i1 %tobool, label %if.then, label %if.end, !prof !10 + +if.end: + ret void + +if.then: + %0 = load i32, ptr %q + store i32 %0, ptr %p + br label %if.end +} + +;; Now the optimization hoist-loads-stores-with-cond-faulting is run in codegen, +;; which is after sroa and alloca is optimized away. So we don't need to do the transform +;; for this case. But in the future, it is probably moved before sroa. +define void @not_alloca(ptr %p, ptr %q, i32 %a) { +; CHECK-LABEL: @not_alloca( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[Q_ADDR:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store ptr [[P:%.*]], ptr [[P_ADDR]], align 8 +; CHECK-NEXT: store ptr [[Q:%.*]], ptr [[Q_ADDR]], align 8 +; CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[Q_ADDR]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[P_ADDR]], align 8 +; CHECK-NEXT: store i32 [[TMP2]], ptr [[TMP3]], align 4 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret void +; +entry: + %p.addr = alloca ptr + %q.addr = alloca ptr + %a.addr = alloca i32 + store ptr %p, ptr %p.addr + store ptr %q, ptr %q.addr + store i32 %a, ptr %a.addr + %0 = load i32, ptr %a.addr + %tobool = icmp ne i32 %0, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: + %1 = load ptr, ptr %q.addr + %2 = load i32, ptr %1 + %3 = load ptr, ptr %p.addr + store i32 %2, ptr %3 + br label %if.end + +if.end: + ret void +} + +;; Not transform if alignment = 2^32. 
+define void @not_maximum_alignment(i1 %cond, ptr %p) { +; CHECK-LABEL: @not_maximum_alignment( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] +; CHECK: if.true: +; CHECK-NEXT: store i32 0, ptr [[P:%.*]], align 4294967296 +; CHECK-NEXT: br label [[IF_FALSE]] +; CHECK: if.false: +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %if.true, label %if.false + +if.true: + store i32 0, ptr %p, align 4294967296 + br label %if.false + +if.false: + ret void +} + +declare i32 @read_memory_only() readonly nounwind willreturn speculatable + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "clang") +!1 = !DIFile(filename: "foo.c", directory: "/tmp") +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{!"clang"} +!5 = !DIBasicType(name: "int", size: 16, encoding: DW_ATE_signed) +!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 4, unit: !0) +!7 = !DILocation(line: 5, column: 7, scope: !6) +!8 = !DILocalVariable(name: "a", scope: !6, line: 6, type: !5) +!9 = distinct !DIAssignID() +!10 = !{!"branch_weights", i32 1, i32 99} +!11 = !{ !"auto-init" } From 121fb2c2ccc9db33278160b485ff0e9d09be9827 Mon Sep 17 00:00:00 2001 From: tcwzxx Date: Thu, 29 Aug 2024 11:18:26 +0800 Subject: [PATCH 2/4] [SLP] Fix the Vec lane overridden by the shuffle mask (#106341) Currently, SLP uses shuffle for the external user of `InsertElementInst` and iterates through the `InsertElementInst` chain to fill the mask with constant indices. However, it may override the original Vec lane. Using the original Vec lane is sufficient. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 122 +++++------------- .../insertelement-across-zero.ll | 2 +- 2 files changed, 36 insertions(+), 88 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index ef5ae9a1a9ccc6..e9128ed0a33401 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10653,6 +10653,17 @@ static T *performExtractsShuffleAction( return Prev; } +namespace { +/// Data type for handling buildvector sequences with the reused scalars from +/// other tree entries. +template struct ShuffledInsertData { + /// List of insertelements to be replaced by shuffles. + SmallVector InsertElements; + /// The parent vectors and shuffle mask for the given list of inserts. + MapVector> ValueMasks; +}; +} // namespace + InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " @@ -10694,8 +10705,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { SmallPtrSet ExtractCostCalculated; InstructionCost ExtractCost = 0; - SmallVector>> ShuffleMasks; - SmallVector> FirstUsers; + SmallVector> ShuffledInserts; SmallVector DemandedElts; SmallDenseSet UsedInserts; DenseSet> VectorCasts; @@ -10732,11 +10742,12 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { if (InsertIdx) { const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar); auto *It = find_if( - FirstUsers, - [this, VU](const std::pair &Pair) { + ShuffledInserts, + [this, VU](const ShuffledInsertData &Data) { + // Checks if 2 insertelements are from the same buildvector. 
+ InsertElementInst *VecInsert = Data.InsertElements.front(); return areTwoInsertFromSameBuildVector( - VU, cast(Pair.first), - [this](InsertElementInst *II) -> Value * { + VU, VecInsert, [this](InsertElementInst *II) -> Value * { Value *Op0 = II->getOperand(0); if (getTreeEntry(II) && !getTreeEntry(Op0)) return nullptr; @@ -10744,36 +10755,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { }); }); int VecId = -1; - if (It == FirstUsers.end()) { - (void)ShuffleMasks.emplace_back(); - SmallVectorImpl &Mask = ShuffleMasks.back()[ScalarTE]; - if (Mask.empty()) - Mask.assign(FTy->getNumElements(), PoisonMaskElem); - // Find the insertvector, vectorized in tree, if any. - Value *Base = VU; - while (auto *IEBase = dyn_cast(Base)) { - if (IEBase != EU.User && - (!IEBase->hasOneUse() || - getElementIndex(IEBase).value_or(*InsertIdx) == *InsertIdx)) - break; - // Build the mask for the vectorized insertelement instructions. - if (const TreeEntry *E = getTreeEntry(IEBase)) { - VU = IEBase; - do { - IEBase = cast(Base); - int Idx = *getElementIndex(IEBase); - assert(Mask[Idx] == PoisonMaskElem && - "InsertElementInstruction used already."); - Mask[Idx] = Idx; - Base = IEBase->getOperand(0); - } while (E == getTreeEntry(Base)); - break; - } - Base = cast(Base)->getOperand(0); - } - FirstUsers.emplace_back(VU, ScalarTE); + if (It == ShuffledInserts.end()) { + auto &Data = ShuffledInserts.emplace_back(); + Data.InsertElements.emplace_back(VU); DemandedElts.push_back(APInt::getZero(FTy->getNumElements())); - VecId = FirstUsers.size() - 1; + VecId = ShuffledInserts.size() - 1; auto It = MinBWs.find(ScalarTE); if (It != MinBWs.end() && VectorCasts @@ -10799,12 +10785,13 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { Cost += C; } } else { - if (isFirstInsertElement(VU, cast(It->first))) - It->first = VU; - VecId = std::distance(FirstUsers.begin(), It); + if (isFirstInsertElement(VU, It->InsertElements.front())) + It->InsertElements.front() = VU; + VecId = std::distance(ShuffledInserts.begin(), It); } int InIdx = *InsertIdx; - SmallVectorImpl &Mask = ShuffleMasks[VecId][ScalarTE]; + SmallVectorImpl &Mask = + ShuffledInserts[VecId].ValueMasks[ScalarTE]; if (Mask.empty()) Mask.assign(FTy->getNumElements(), PoisonMaskElem); Mask[InIdx] = EU.Lane; @@ -10978,9 +10965,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { return std::make_pair(TE, false); }; // Calculate the cost of the reshuffled vectors, if any. 
- for (int I = 0, E = FirstUsers.size(); I < E; ++I) { - Value *Base = cast(FirstUsers[I].first)->getOperand(0); - auto Vector = ShuffleMasks[I].takeVector(); + for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) { + Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0); + auto Vector = ShuffledInserts[I].ValueMasks.takeVector(); unsigned VF = 0; auto EstimateShufflesCost = [&](ArrayRef Mask, ArrayRef TEs) { @@ -11031,7 +11018,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF, EstimateShufflesCost); InstructionCost InsertCost = TTI->getScalarizationOverhead( - cast(FirstUsers[I].first->getType()), DemandedElts[I], + cast( + ShuffledInserts[I].InsertElements.front()->getType()), + DemandedElts[I], /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput); Cost -= InsertCost; } @@ -14131,17 +14120,6 @@ Value *BoUpSLP::vectorizeTree() { return vectorizeTree(ExternallyUsedValues, ReplacedExternals); } -namespace { -/// Data type for handling buildvector sequences with the reused scalars from -/// other tree entries. -struct ShuffledInsertData { - /// List of insertelements to be replaced by shuffles. - SmallVector InsertElements; - /// The parent vectors and shuffle mask for the given list of inserts. - MapVector> ValueMasks; -}; -} // namespace - Value *BoUpSLP::vectorizeTree( const ExtraValueToDebugLocsMap &ExternallyUsedValues, SmallVectorImpl> &ReplacedExternals, @@ -14279,7 +14257,7 @@ Value *BoUpSLP::vectorizeTree( LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); - SmallVector ShuffledInserts; + SmallVector> ShuffledInserts; // Maps vector instruction to original insertelement instruction DenseMap VectorToInsertElement; // Maps extract Scalar to the corresponding extractelement instruction in the @@ -14492,8 +14470,8 @@ Value *BoUpSLP::vectorizeTree( std::optional InsertIdx = getElementIndex(VU); if (InsertIdx) { - auto *It = - find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) { + auto *It = find_if( + ShuffledInserts, [VU](const ShuffledInsertData &Data) { // Checks if 2 insertelements are from the same buildvector. InsertElementInst *VecInsert = Data.InsertElements.front(); return areTwoInsertFromSameBuildVector( @@ -14505,36 +14483,6 @@ Value *BoUpSLP::vectorizeTree( (void)ShuffledInserts.emplace_back(); It = std::next(ShuffledInserts.begin(), ShuffledInserts.size() - 1); - SmallVectorImpl &Mask = It->ValueMasks[Vec]; - if (Mask.empty()) - Mask.assign(FTy->getNumElements(), PoisonMaskElem); - // Find the insertvector, vectorized in tree, if any. - Value *Base = VU; - while (auto *IEBase = dyn_cast(Base)) { - if (IEBase != User && - (!IEBase->hasOneUse() || - getElementIndex(IEBase).value_or(Idx) == Idx)) - break; - // Build the mask for the vectorized insertelement instructions. - if (const TreeEntry *E = getTreeEntry(IEBase)) { - do { - IEBase = cast(Base); - int IEIdx = *getElementIndex(IEBase); - assert(Mask[IEIdx] == PoisonMaskElem && - "InsertElementInstruction used already."); - Mask[IEIdx] = IEIdx; - Base = IEBase->getOperand(0); - } while (E == getTreeEntry(Base)); - break; - } - Base = cast(Base)->getOperand(0); - // After the vectorization the def-use chain has changed, need - // to look through original insertelement instructions, if they - // get replaced by vector instructions. 
- auto It = VectorToInsertElement.find(Base); - if (It != VectorToInsertElement.end()) - Base = It->second; - } } SmallVectorImpl &Mask = It->ValueMasks[Vec]; if (Mask.empty()) diff --git a/llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll index 1d54f59f2fdd84..28afa40640bf63 100644 --- a/llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll +++ b/llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll @@ -12,7 +12,7 @@ define void @test(i8 %0, i8 %1) { ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 0, i32 7 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> zeroinitializer, [[TMP9]] ; CHECK-NEXT: ret void From ee6961dbf13167bf09b602b136d72f72d7c8ff0c Mon Sep 17 00:00:00 2001 From: Nathan Ridge Date: Wed, 28 Aug 2024 23:37:01 -0400 Subject: [PATCH 3/4] [clangd] Do not collect macros when clang-tidy checks call into the preprocessor (#106329) Fixes https://github.com/llvm/llvm-project/issues/99617 --- clang-tools-extra/clangd/CollectMacros.cpp | 1 + clang-tools-extra/clangd/CollectMacros.h | 8 ++++++++ clang-tools-extra/clangd/ParsedAST.cpp | 8 +++++++- .../clangd/unittests/DiagnosticsTests.cpp | 17 +++++++++++++++++ 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/clangd/CollectMacros.cpp b/clang-tools-extra/clangd/CollectMacros.cpp index c5ba8d903ba482..96298ee3ea50ae 100644 --- a/clang-tools-extra/clangd/CollectMacros.cpp +++ b/clang-tools-extra/clangd/CollectMacros.cpp @@ -32,6 +32,7 @@ void CollectMainFileMacros::add(const Token &MacroNameTok, const MacroInfo *MI, if (Loc.isInvalid() || Loc.isMacroID()) return; + assert(isInsideMainFile(Loc, SM)); auto Name = MacroNameTok.getIdentifierInfo()->getName(); Out.Names.insert(Name); size_t Start = SM.getFileOffset(Loc); diff --git a/clang-tools-extra/clangd/CollectMacros.h b/clang-tools-extra/clangd/CollectMacros.h index e3900c08e5df7b..e7198641d8d53c 100644 --- a/clang-tools-extra/clangd/CollectMacros.h +++ b/clang-tools-extra/clangd/CollectMacros.h @@ -82,6 +82,14 @@ class CollectMainFileMacros : public PPCallbacks { void SourceRangeSkipped(SourceRange R, SourceLocation EndifLoc) override; + // Called when the AST build is done to disable further recording + // of macros by this class. This is needed because some clang-tidy + // checks can trigger PP callbacks by calling directly into the + // preprocessor. Such calls are not interleaved with FileChanged() + // in the expected way, leading this class to erroneously process + // macros that are not in the main file. 
+ void doneParse() { InMainFile = false; } + private: void add(const Token &MacroNameTok, const MacroInfo *MI, bool IsDefinition = false, bool InConditionalDirective = false); diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp index 14440acd08b353..a8b6cc8dd0a46f 100644 --- a/clang-tools-extra/clangd/ParsedAST.cpp +++ b/clang-tools-extra/clangd/ParsedAST.cpp @@ -688,7 +688,9 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs, Marks = Patch->marks(); } auto &PP = Clang->getPreprocessor(); - PP.addPPCallbacks(std::make_unique(PP, Macros)); + auto MacroCollector = std::make_unique(PP, Macros); + auto *MacroCollectorPtr = MacroCollector.get(); // so we can call doneParse() + PP.addPPCallbacks(std::move(MacroCollector)); PP.addPPCallbacks( collectPragmaMarksCallback(Clang->getSourceManager(), Marks)); @@ -709,6 +711,10 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs, log("Execute() failed when building AST for {0}: {1}", MainInput.getFile(), toString(std::move(Err))); + // Disable the macro collector for the remainder of this function, e.g. + // clang-tidy checkers. + MacroCollectorPtr->doneParse(); + // We have to consume the tokens before running clang-tidy to avoid collecting // tokens from running the preprocessor inside the checks (only // modernize-use-trailing-return-type does that today). diff --git a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp index 25d2f03e0b366b..096f77e414f5a5 100644 --- a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp +++ b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp @@ -940,6 +940,23 @@ TEST(DiagnosticTest, ClangTidySelfContainedDiagsFormatting) { withFix(equalToFix(ExpectedFix2)))))); } +TEST(DiagnosticsTest, ClangTidyCallingIntoPreprocessor) { + std::string Main = R"cpp( + extern "C" { + #include "b.h" + } + )cpp"; + std::string Header = R"cpp( + #define EXTERN extern + EXTERN int waldo(); + )cpp"; + auto TU = TestTU::withCode(Main); + TU.AdditionalFiles["b.h"] = Header; + TU.ClangTidyProvider = addTidyChecks("modernize-use-trailing-return-type"); + // Check that no assertion failures occur during the build + TU.build(); +} + TEST(DiagnosticsTest, Preprocessor) { // This looks like a preamble, but there's an #else in the middle! // Check that: From 82ebd333a889d2372c8445dc3d5d527ec48537db Mon Sep 17 00:00:00 2001 From: Jacob Lalonde Date: Wed, 28 Aug 2024 21:15:30 -0700 Subject: [PATCH 4/4] [LLDB][Minidumps] Read x64 registers as 64b and handle truncation in the file builder (#106473) This patch addresses a bug where `cs`/`fs` and other segmentation flags were being identified as having a type of `32b` and `64b` for `rflags`. In that case the register value was returning the fail value `0xF...` and this was corrupting some minidumps. Here we just read it as a 64b value and truncate it. In addition to that fix, I added comparing the registers from the live process to the loaded core for the generic minidump test. Prior only being ARM register tests. This explains why this was not detected before. 
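
Below is a minimal, self-contained sketch of the read-wide-then-truncate idea described above. It is not LLDB's actual API: the map-backed `read_register_u64` helper and the `ThreadContext64` struct are illustrative stand-ins for `RegisterContext` and the minidump thread context. The point is that the value is always fetched at 64-bit width and a narrowing cast produces the 32-bit/16-bit fields, so the result no longer depends on how the register plugin types `cs`, `fs`, or `rflags`.

```
// Sketch only: stand-ins for RegisterContext and the minidump thread context.
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

struct ThreadContext64 {
  uint32_t eflags = 0;
  uint16_t cs = 0, fs = 0, gs = 0, ss = 0, ds = 0;
};

// Hypothetical reader: returns the raw 64-bit value, or all ones on failure
// (mirroring the fail value mentioned above).
static uint64_t read_register_u64(const std::map<std::string, uint64_t> &regs,
                                  const std::string &name) {
  auto it = regs.find(name);
  return it == regs.end() ? ~0ULL : it->second;
}

int main() {
  const std::map<std::string, uint64_t> regs = {
      {"rflags", 0x246}, {"cs", 0x33}, {"fs", 0x0}};
  ThreadContext64 ctx;
  // Read at 64-bit width regardless of the register's reported type, then
  // truncate to the width of the destination field.
  ctx.eflags = static_cast<uint32_t>(read_register_u64(regs, "rflags"));
  ctx.cs = static_cast<uint16_t>(read_register_u64(regs, "cs"));
  ctx.fs = static_cast<uint16_t>(read_register_u64(regs, "fs"));
  std::printf("eflags=0x%x cs=0x%x fs=0x%x\n", ctx.eflags, ctx.cs, ctx.fs);
  return 0;
}
```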
--- .../Minidump/MinidumpFileBuilder.cpp | 14 ++++---- .../TestProcessSaveCoreMinidump.py | 36 +++++++++++++++++-- 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp index 689a3fb0e84852..13355afb58dbd1 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp +++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp @@ -491,12 +491,14 @@ GetThreadContext_x86_64(RegisterContext *reg_ctx) { thread_context.r14 = read_register_u64(reg_ctx, "r14"); thread_context.r15 = read_register_u64(reg_ctx, "r15"); thread_context.rip = read_register_u64(reg_ctx, "rip"); - thread_context.eflags = read_register_u32(reg_ctx, "rflags"); - thread_context.cs = read_register_u16(reg_ctx, "cs"); - thread_context.fs = read_register_u16(reg_ctx, "fs"); - thread_context.gs = read_register_u16(reg_ctx, "gs"); - thread_context.ss = read_register_u16(reg_ctx, "ss"); - thread_context.ds = read_register_u16(reg_ctx, "ds"); + // To make our code agnostic to whatever type the register value identifies + // itself as, we read as a u64 and truncate to u32/u16 ourselves. + thread_context.eflags = read_register_u64(reg_ctx, "rflags"); + thread_context.cs = read_register_u64(reg_ctx, "cs"); + thread_context.fs = read_register_u64(reg_ctx, "fs"); + thread_context.gs = read_register_u64(reg_ctx, "gs"); + thread_context.ss = read_register_u64(reg_ctx, "ss"); + thread_context.ds = read_register_u64(reg_ctx, "ds"); return thread_context; } diff --git a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py index 5abaa05a90f63e..ea59aef004aff5 100644 --- a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py +++ b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py @@ -17,6 +17,7 @@ def verify_core_file( expected_modules, expected_threads, stacks_to_sps_map, + stacks_to_registers_map, ): # To verify, we'll launch with the mini dump target = self.dbg.CreateTarget(None) @@ -62,6 +63,17 @@ def verify_core_file( # Try to read just past the red zone and fail process.ReadMemory(sp - red_zone - 1, 1, error) self.assertTrue(error.Fail(), "No failure when reading past the red zone") + # Verify the registers are the same + self.assertIn(thread_id, stacks_to_registers_map) + register_val_list = stacks_to_registers_map[thread_id] + frame_register_list = frame.GetRegisters() + for x in register_val_list: + self.assertEqual( + x.GetValueAsUnsigned(), + frame_register_list.GetFirstValueByName( + x.GetName() + ).GetValueAsUnsigned(), + ) self.dbg.DeleteTarget(target) @@ -93,12 +105,16 @@ def test_save_linux_mini_dump(self): expected_number_of_threads = process.GetNumThreads() expected_threads = [] stacks_to_sp_map = {} + stakcs_to_registers_map = {} for thread_idx in range(process.GetNumThreads()): thread = process.GetThreadAtIndex(thread_idx) thread_id = thread.GetThreadID() expected_threads.append(thread_id) stacks_to_sp_map[thread_id] = thread.GetFrameAtIndex(0).GetSP() + stakcs_to_registers_map[thread_id] = thread.GetFrameAtIndex( + 0 + ).GetRegisters() # save core and, kill process and verify corefile existence base_command = "process save-core --plugin-name=minidump " @@ -110,6 +126,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, + 
stakcs_to_registers_map, ) self.runCmd(base_command + " --style=modified-memory '%s'" % (core_dirty)) @@ -120,6 +137,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, + stakcs_to_registers_map, ) self.runCmd(base_command + " --style=full '%s'" % (core_full)) @@ -130,6 +148,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, + stakcs_to_registers_map, ) options = lldb.SBSaveCoreOptions() @@ -147,6 +166,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, + stakcs_to_registers_map, ) options = lldb.SBSaveCoreOptions() @@ -163,6 +183,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, + stakcs_to_registers_map, ) # Minidump can now save full core files, but they will be huge and @@ -181,6 +202,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, + stakcs_to_registers_map, ) self.assertSuccess(process.Kill()) @@ -276,13 +298,16 @@ def test_save_linux_mini_dump_default_options(self): expected_threads = [] stacks_to_sp_map = {} expected_pid = process.GetProcessInfo().GetProcessID() + stacks_to_registers_map = {} for thread_idx in range(process.GetNumThreads()): thread = process.GetThreadAtIndex(thread_idx) thread_id = thread.GetThreadID() expected_threads.append(thread_id) stacks_to_sp_map[thread_id] = thread.GetFrameAtIndex(0).GetSP() - + stacks_to_registers_map[thread_id] = thread.GetFrameAtIndex( + 0 + ).GetRegisters() # This is almost identical to the single thread test case because # minidump defaults to stacks only, so we want to see if the @@ -294,7 +319,14 @@ def test_save_linux_mini_dump_default_options(self): error = process.SaveCore(options) self.assertTrue(error.Success()) - self.verify_core_file(default_value_file, expected_pid, expected_modules, expected_threads, stacks_to_sp_map) + self.verify_core_file( + default_value_file, + expected_pid, + expected_modules, + expected_threads, + stacks_to_sp_map, + stacks_to_registers_map, + ) finally: self.assertTrue(self.dbg.DeleteTarget(target))
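
For completeness, the SimplifyCFG knob introduced in the first patch of this series can be toggled directly from `opt`: either via the pass-parameter spelling parsed in `PassBuilder.cpp`, or via the hidden `cl::opt` override added in `SimplifyCFGPass.cpp`. The RUN lines below are a sketch modeled on the hoist-loads-stores-with-cf.ll test above (they assume the input IR is something like its `basic` function), not additions to any test in this series:

```
; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes='simplifycfg<hoist-loads-stores-with-cond-faulting>' -S
; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes=simplifycfg -hoist-loads-stores-with-cond-faulting -S
```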