Skip to content

Commit

Permalink
[InferAlignment] Implement InferAlignmentPass
Browse files Browse the repository at this point in the history
This pass aims to infer alignment for instructions as a separate pass,
to reduce redundant work done by InstCombine running multiple times. It
runs late in the pipeline, just before the back-end passes where this
information is most useful.

Differential Revision: https://reviews.llvm.org/D158529
  • Loading branch information
dc03 committed Sep 20, 2023
1 parent 3978f37 commit 0f152a5
Show file tree
Hide file tree
Showing 20 changed files with 250 additions and 117 deletions.
27 changes: 27 additions & 0 deletions llvm/include/llvm/Transforms/Scalar/InferAlignment.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
//===- InferAlignment.h -----------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Infer alignment for loads, stores and other memory operations based on
// trailing zero known bits information.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TRANSFORMS_SCALAR_INFERALIGNMENT_H
#define LLVM_TRANSFORMS_SCALAR_INFERALIGNMENT_H

#include "llvm/IR/PassManager.h"

namespace llvm {

/// A pass that infers alignment for load and store instructions from
/// preferred type alignment and known-bits information on their pointer
/// operands. Intended to run late in the pipeline so the back end sees the
/// best alignment available; see InferAlignment.cpp for the implementation.
struct InferAlignmentPass : public PassInfoMixin<InferAlignmentPass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};

} // namespace llvm

#endif // LLVM_TRANSFORMS_SCALAR_INFERALIGNMENT_H
9 changes: 9 additions & 0 deletions llvm/include/llvm/Transforms/Utils/Local.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,15 @@ AllocaInst *DemoteRegToStack(Instruction &X,
/// deleted and it returns the pointer to the alloca inserted.
AllocaInst *DemotePHIToStack(PHINode *P, Instruction *AllocaPoint = nullptr);

/// If the specified pointer points to an object that we control, try to modify
/// the object's alignment to PrefAlign. Returns a minimum known alignment of
/// the value after the operation, which may be lower than PrefAlign.
///
/// Increasing value alignment isn't often possible though. If alignment is
/// important, a more reliable approach is to simply align all global variables
/// and allocation instructions to their preferred alignment from the beginning.
Align tryEnforceAlignment(Value *V, Align PrefAlign, const DataLayout &DL);

/// Try to ensure that the alignment of \p V is at least \p PrefAlign bytes. If
/// the owning object can be modified and has an alignment less than \p
/// PrefAlign, it will be increased and \p PrefAlign returned. If the alignment
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Passes/PassBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@
#include "llvm/Transforms/Scalar/IndVarSimplify.h"
#include "llvm/Transforms/Scalar/InductiveRangeCheckElimination.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Scalar/InferAlignment.h"
#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
#include "llvm/Transforms/Scalar/JumpThreading.h"
#include "llvm/Transforms/Scalar/LICM.h"
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Passes/PassBuilderPipelines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
#include "llvm/Transforms/Scalar/Float2Int.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/IndVarSimplify.h"
#include "llvm/Transforms/Scalar/InferAlignment.h"
#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
#include "llvm/Transforms/Scalar/JumpThreading.h"
#include "llvm/Transforms/Scalar/LICM.h"
Expand Down Expand Up @@ -274,6 +275,11 @@ cl::opt<bool> EnableMemProfContextDisambiguation(
"enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));

cl::opt<bool> EnableInferAlignmentPass(
"enable-infer-alignment-pass", cl::init(false), cl::Hidden, cl::ZeroOrMore,
cl::desc("Enable the InferAlignment pass, disabling alignment inference in "
"InstCombine"));

PipelineTuningOptions::PipelineTuningOptions() {
LoopInterleaving = true;
LoopVectorization = true;
Expand Down Expand Up @@ -1140,6 +1146,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
FPM.addPass(LoopVectorizePass(
LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));

if (EnableInferAlignmentPass)
FPM.addPass(InferAlignmentPass());
if (IsFullLTO) {
// The vectorizer may have significantly shortened a loop body; unroll
// again. Unroll small loops to hide loop backedge latency and saturate any
Expand Down Expand Up @@ -1257,6 +1265,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
FPM.addPass(SROAPass(SROAOptions::PreserveCFG));
}

if (EnableInferAlignmentPass)
FPM.addPass(InferAlignmentPass());
FPM.addPass(InstCombinePass());

// This is needed for two reasons:
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Passes/PassRegistry.def
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,7 @@ FUNCTION_PASS("gvn-hoist", GVNHoistPass())
FUNCTION_PASS("gvn-sink", GVNSinkPass())
FUNCTION_PASS("helloworld", HelloWorldPass())
FUNCTION_PASS("infer-address-spaces", InferAddressSpacesPass())
FUNCTION_PASS("infer-alignment", InferAlignmentPass())
FUNCTION_PASS("instcount", InstCountPass())
FUNCTION_PASS("instsimplify", InstSimplifyPass())
FUNCTION_PASS("invalidate<all>", InvalidateAllAnalysesPass())
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Scalar/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ add_llvm_component_library(LLVMScalarOpts
InductiveRangeCheckElimination.cpp
IndVarSimplify.cpp
InferAddressSpaces.cpp
InferAlignment.cpp
InstSimplifyPass.cpp
JumpThreading.cpp
LICM.cpp
Expand Down
91 changes: 91 additions & 0 deletions llvm/lib/Transforms/Scalar/InferAlignment.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
//===- InferAlignment.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Infer alignment for loads, stores and other memory operations based on
// trailing zero known bits information.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/InferAlignment.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

static bool tryToImproveAlign(
const DataLayout &DL, Instruction *I,
function_ref<Align(Value *PtrOp, Align OldAlign, Align PrefAlign)> Fn) {
if (auto *LI = dyn_cast<LoadInst>(I)) {
Value *PtrOp = LI->getPointerOperand();
Align OldAlign = LI->getAlign();
Align NewAlign = Fn(PtrOp, OldAlign, DL.getPrefTypeAlign(LI->getType()));
if (NewAlign > OldAlign) {
LI->setAlignment(NewAlign);
return true;
}
} else if (auto *SI = dyn_cast<StoreInst>(I)) {
Value *PtrOp = SI->getPointerOperand();
Value *ValOp = SI->getValueOperand();
Align OldAlign = SI->getAlign();
Align NewAlign = Fn(PtrOp, OldAlign, DL.getPrefTypeAlign(ValOp->getType()));
if (NewAlign > OldAlign) {
SI->setAlignment(NewAlign);
return true;
}
}
// TODO: Also handle memory intrinsics.
return false;
}

/// Infer improved alignments for every load and store in \p F, first by
/// trying to enforce each access's preferred type alignment on the underlying
/// object, then by computing alignment from known bits of the pointer.
/// Returns true if any instruction was modified.
///
/// Marked static: this helper is local to this file and is not declared in
/// any header, so it should have internal linkage.
static bool inferAlignment(Function &F, AssumptionCache &AC,
                           DominatorTree &DT) {
  const DataLayout &DL = F.getParent()->getDataLayout();
  bool Changed = false;

  // Enforce preferred type alignment if possible. We do this as a separate
  // pass first, because it may improve the alignments we infer below.
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      Changed |= tryToImproveAlign(
          DL, &I, [&](Value *PtrOp, Align OldAlign, Align PrefAlign) {
            if (PrefAlign > OldAlign)
              return std::max(OldAlign,
                              tryEnforceAlignment(PtrOp, PrefAlign, DL));
            return OldAlign;
          });
    }
  }

  // Compute alignment from known bits.
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      Changed |= tryToImproveAlign(
          DL, &I, [&](Value *PtrOp, Align OldAlign, Align PrefAlign) {
            KnownBits Known = computeKnownBits(PtrOp, DL, 0, &AC, &I, &DT);
            // Clamp the exponent both to the maximum alignment LLVM supports
            // and to the pointer's bit width. The unary '+' avoids ODR-using
            // Value::MaxAlignmentExponent.
            unsigned TrailZ = std::min(Known.countMinTrailingZeros(),
                                       +Value::MaxAlignmentExponent);
            return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
          });
    }
  }

  return Changed;
}

// Entry point for the new pass manager: fetch the analyses inferAlignment
// needs and run it over the function.
PreservedAnalyses InferAlignmentPass::run(Function &F,
                                          FunctionAnalysisManager &AM) {
  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  inferAlignment(F, AC, DT);
  // Changes to alignment shouldn't invalidate analyses.
  return PreservedAnalyses::all();
}
11 changes: 2 additions & 9 deletions llvm/lib/Transforms/Utils/Local.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1388,15 +1388,8 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
return Changed;
}

/// If the specified pointer points to an object that we control, try to modify
/// the object's alignment to PrefAlign. Returns a minimum known alignment of
/// the value after the operation, which may be lower than PrefAlign.
///
/// Increasing value alignment isn't often possible though. If alignment is
/// important, a more reliable approach is to simply align all global variables
/// and allocation instructions to their preferred alignment from the beginning.
static Align tryEnforceAlignment(Value *V, Align PrefAlign,
const DataLayout &DL) {
Align llvm::tryEnforceAlignment(Value *V, Align PrefAlign,
const DataLayout &DL) {
V = V->stripPointerCasts();

if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/Transforms/InferAlignment/alloca.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt < %s -passes=no-op-function -S | FileCheck %s
; RUN: opt < %s -passes=infer-alignment -S | FileCheck %s

; ------------------------------------------------------------------------------
; Scalar type
Expand All @@ -8,11 +8,11 @@
define void @alloca_local(i8 %x, i32 %y) {
; CHECK-LABEL: define void @alloca_local
; CHECK-SAME: (i8 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 1
; CHECK-NEXT: [[LOAD_I8:%.*]] = load i8, ptr [[ALLOCA]], align 1
; CHECK-NEXT: [[LOAD_I32:%.*]] = load i32, ptr [[ALLOCA]], align 1
; CHECK-NEXT: store i8 [[X]], ptr [[ALLOCA]], align 1
; CHECK-NEXT: store i32 [[Y]], ptr [[ALLOCA]], align 1
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[LOAD_I8:%.*]] = load i8, ptr [[ALLOCA]], align 4
; CHECK-NEXT: [[LOAD_I32:%.*]] = load i32, ptr [[ALLOCA]], align 4
; CHECK-NEXT: store i8 [[X]], ptr [[ALLOCA]], align 4
; CHECK-NEXT: store i32 [[Y]], ptr [[ALLOCA]], align 4
; CHECK-NEXT: ret void
;
%alloca = alloca i32, align 1
Expand All @@ -38,10 +38,10 @@ define void @alloca_struct(i32 %x) {
; CHECK-NEXT: [[ALLOCA_STRUCT:%.*]] = alloca [[STRUCT_PAIR:%.*]], align 8
; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr [[STRUCT_PAIR]], ptr [[ALLOCA_STRUCT]], i64 0, i32 1
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr { i32, i32 }, ptr [[GEP_0]], i64 0, i32 1
; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, ptr [[GEP_0]], align 1
; CHECK-NEXT: store i32 0, ptr [[GEP_0]], align 1
; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, ptr [[GEP_1]], align 1
; CHECK-NEXT: store i32 0, ptr [[GEP_1]], align 1
; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, ptr [[GEP_0]], align 8
; CHECK-NEXT: store i32 0, ptr [[GEP_0]], align 8
; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, ptr [[GEP_1]], align 4
; CHECK-NEXT: store i32 0, ptr [[GEP_1]], align 4
; CHECK-NEXT: ret void
;
%alloca.struct = alloca %struct.pair
Expand Down
38 changes: 19 additions & 19 deletions llvm/test/Transforms/InferAlignment/atomic.ll
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt -S < %s -passes=no-op-function | FileCheck %s
; RUN: opt -S < %s -passes=infer-alignment | FileCheck %s

; ------------------------------------------------------------------------------
; load/store of null
; ------------------------------------------------------------------------------

define void @load_null() {
; CHECK-LABEL: define void @load_null() {
; CHECK-NEXT: [[X_0:%.*]] = load atomic i32, ptr null unordered, align 4
; CHECK-NEXT: [[X_1:%.*]] = load atomic i32, ptr null monotonic, align 4
; CHECK-NEXT: [[X_2:%.*]] = load atomic i32, ptr null seq_cst, align 4
; CHECK-NEXT: [[X_0:%.*]] = load atomic i32, ptr null unordered, align 4294967296
; CHECK-NEXT: [[X_1:%.*]] = load atomic i32, ptr null monotonic, align 4294967296
; CHECK-NEXT: [[X_2:%.*]] = load atomic i32, ptr null seq_cst, align 4294967296
; CHECK-NEXT: ret void
;
%x.0 = load atomic i32, ptr null unordered, align 4
Expand All @@ -20,9 +20,9 @@ define void @load_null() {

define void @store_null() {
; CHECK-LABEL: define void @store_null() {
; CHECK-NEXT: store atomic i32 0, ptr null unordered, align 4
; CHECK-NEXT: store atomic i32 0, ptr null monotonic, align 4
; CHECK-NEXT: store atomic i32 0, ptr null seq_cst, align 4
; CHECK-NEXT: store atomic i32 0, ptr null unordered, align 4294967296
; CHECK-NEXT: store atomic i32 0, ptr null monotonic, align 4294967296
; CHECK-NEXT: store atomic i32 0, ptr null seq_cst, align 4294967296
; CHECK-NEXT: ret void
;
store atomic i32 0, ptr null unordered, align 4
Expand All @@ -38,9 +38,9 @@ define void @store_null() {

define void @load_nonnull() {
; CHECK-LABEL: define void @load_nonnull() {
; CHECK-NEXT: [[X_0:%.*]] = load atomic i32, ptr @c unordered, align 4
; CHECK-NEXT: [[X_1:%.*]] = load atomic i32, ptr @c monotonic, align 4
; CHECK-NEXT: [[X_2:%.*]] = load atomic i32, ptr @c seq_cst, align 4
; CHECK-NEXT: [[X_0:%.*]] = load atomic i32, ptr @c unordered, align 8
; CHECK-NEXT: [[X_1:%.*]] = load atomic i32, ptr @c monotonic, align 8
; CHECK-NEXT: [[X_2:%.*]] = load atomic i32, ptr @c seq_cst, align 8
; CHECK-NEXT: ret void
;
%x.0 = load atomic i32, ptr @c unordered, align 4
Expand All @@ -51,9 +51,9 @@ define void @load_nonnull() {

define void @store_nonnull() {
; CHECK-LABEL: define void @store_nonnull() {
; CHECK-NEXT: store atomic i32 0, ptr @c unordered, align 4
; CHECK-NEXT: store atomic i32 0, ptr @c monotonic, align 4
; CHECK-NEXT: store atomic i32 0, ptr @c seq_cst, align 4
; CHECK-NEXT: store atomic i32 0, ptr @c unordered, align 8
; CHECK-NEXT: store atomic i32 0, ptr @c monotonic, align 8
; CHECK-NEXT: store atomic i32 0, ptr @c seq_cst, align 8
; CHECK-NEXT: ret void
;
store atomic i32 0, ptr @c unordered, align 4
Expand All @@ -69,9 +69,9 @@ define void @store_nonnull() {
define void @load_alloca() {
; CHECK-LABEL: define void @load_alloca() {
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[X_0:%.*]] = load atomic i32, ptr [[ALLOCA]] unordered, align 1
; CHECK-NEXT: [[X_1:%.*]] = load atomic i32, ptr [[ALLOCA]] monotonic, align 1
; CHECK-NEXT: [[X_2:%.*]] = load atomic i32, ptr [[ALLOCA]] seq_cst, align 1
; CHECK-NEXT: [[X_0:%.*]] = load atomic i32, ptr [[ALLOCA]] unordered, align 4
; CHECK-NEXT: [[X_1:%.*]] = load atomic i32, ptr [[ALLOCA]] monotonic, align 4
; CHECK-NEXT: [[X_2:%.*]] = load atomic i32, ptr [[ALLOCA]] seq_cst, align 4
; CHECK-NEXT: ret void
;
%alloca = alloca i32
Expand All @@ -84,9 +84,9 @@ define void @load_alloca() {
define void @store_alloca() {
; CHECK-LABEL: define void @store_alloca() {
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4
; CHECK-NEXT: store atomic i32 0, ptr [[ALLOCA]] unordered, align 1
; CHECK-NEXT: store atomic i32 0, ptr [[ALLOCA]] monotonic, align 1
; CHECK-NEXT: store atomic i32 0, ptr [[ALLOCA]] seq_cst, align 1
; CHECK-NEXT: store atomic i32 0, ptr [[ALLOCA]] unordered, align 4
; CHECK-NEXT: store atomic i32 0, ptr [[ALLOCA]] monotonic, align 4
; CHECK-NEXT: store atomic i32 0, ptr [[ALLOCA]] seq_cst, align 4
; CHECK-NEXT: ret void
;
%alloca = alloca i32
Expand Down
14 changes: 7 additions & 7 deletions llvm/test/Transforms/InferAlignment/attributes.ll
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt < %s -passes=no-op-function -S | FileCheck %s
; RUN: opt < %s -passes=infer-alignment -S | FileCheck %s

define void @attribute(ptr align 32 %a) {
; CHECK-LABEL: define void @attribute
; CHECK-SAME: (ptr align 32 [[A:%.*]]) {
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A]], align 1
; CHECK-NEXT: store i32 123, ptr [[A]], align 1
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A]], align 32
; CHECK-NEXT: store i32 123, ptr [[A]], align 32
; CHECK-NEXT: ret void
;
%load = load i32, ptr %a, align 1
Expand All @@ -17,8 +17,8 @@ define void @attribute_through_call(ptr align 32 %a) {
; CHECK-LABEL: define void @attribute_through_call
; CHECK-SAME: (ptr align 32 [[A:%.*]]) {
; CHECK-NEXT: [[RES:%.*]] = call ptr @call(ptr [[A]])
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[RES]], align 1
; CHECK-NEXT: store i32 123, ptr [[RES]], align 1
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[RES]], align 32
; CHECK-NEXT: store i32 123, ptr [[RES]], align 32
; CHECK-NEXT: ret void
;
%res = call ptr @call(ptr %a)
Expand All @@ -31,8 +31,8 @@ define void @attribute_return_value(ptr %a) {
; CHECK-LABEL: define void @attribute_return_value
; CHECK-SAME: (ptr [[A:%.*]]) {
; CHECK-NEXT: [[RES:%.*]] = call align 32 ptr @call(ptr [[A]])
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[RES]], align 1
; CHECK-NEXT: store i32 123, ptr [[RES]], align 1
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[RES]], align 32
; CHECK-NEXT: store i32 123, ptr [[RES]], align 32
; CHECK-NEXT: ret void
;
%res = call align 32 ptr @call(ptr %a)
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/Transforms/InferAlignment/gep-2d.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt < %s -passes=no-op-function -S | FileCheck %s
; RUN: opt < %s -passes=infer-alignment -S | FileCheck %s

; A multi-dimensional array in a nested loop.inner doing vector stores that
; aren't yet aligned. InferAlignment can understand the addressing in the
Expand All @@ -21,8 +21,8 @@ define void @nested_loop() {
; CHECK: loop.inner:
; CHECK-NEXT: [[J:%.*]] = phi i64 [ 0, [[LOOP_OUTER]] ], [ [[J_NEXT:%.*]], [[LOOP_INNER_TAIL:%.*]] ]
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr [1001 x [20000 x double]], ptr @Nice, i64 0, i64 [[I]], i64 [[J]]
; CHECK-NEXT: store <2 x double> zeroinitializer, ptr [[GEP_1]], align 8
; CHECK-NEXT: [[LOAD_1:%.*]] = load <2 x double>, ptr [[GEP_1]], align 8
; CHECK-NEXT: store <2 x double> zeroinitializer, ptr [[GEP_1]], align 16
; CHECK-NEXT: [[LOAD_1:%.*]] = load <2 x double>, ptr [[GEP_1]], align 16
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr [1001 x [20001 x double]], ptr @Awkward, i64 0, i64 [[I]], i64 [[J]]
; CHECK-NEXT: store <2 x double> zeroinitializer, ptr [[GEP_2]], align 8
; CHECK-NEXT: [[LOAD_2:%.*]] = load <2 x double>, ptr [[GEP_2]], align 8
Expand Down
Loading

0 comments on commit 0f152a5

Please sign in to comment.