diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt
index 693f0d0b35edc..9d91100d35b3a 100644
--- a/llvm/lib/Target/NVPTX/CMakeLists.txt
+++ b/llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -26,6 +26,7 @@ set(NVPTXCodeGen_sources
   NVPTXISelLowering.cpp
   NVPTXLowerAggrCopies.cpp
   NVPTXLowerAlloca.cpp
+  NVPTXIncreaseAlignment.cpp
   NVPTXLowerArgs.cpp
   NVPTXLowerUnreachable.cpp
   NVPTXMCExpr.cpp
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 77a0e03d4075a..5b2f10be072bb 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -55,6 +55,7 @@ FunctionPass *createNVPTXTagInvariantLoadsPass();
 MachineFunctionPass *createNVPTXPeephole();
 MachineFunctionPass *createNVPTXProxyRegErasurePass();
 MachineFunctionPass *createNVPTXForwardParamsPass();
+FunctionPass *createNVPTXIncreaseLocalAlignmentPass();
 
 void initializeNVVMReflectLegacyPassPass(PassRegistry &);
 void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
@@ -77,6 +78,7 @@ void initializeNVPTXExternalAAWrapperPass(PassRegistry &);
 void initializeNVPTXPeepholePass(PassRegistry &);
 void initializeNVPTXTagInvariantLoadLegacyPassPass(PassRegistry &);
 void initializeNVPTXPrologEpilogPassPass(PassRegistry &);
+void initializeNVPTXIncreaseLocalAlignmentLegacyPassPass(PassRegistry &);
 
 struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
@@ -112,6 +114,11 @@ struct NVPTXTagInvariantLoadsPass : PassInfoMixin<NVPTXTagInvariantLoadsPass> {
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
+struct NVPTXIncreaseLocalAlignmentPass
+    : PassInfoMixin<NVPTXIncreaseLocalAlignmentPass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
 namespace NVPTX {
 enum DrvInterface {
   NVCL,
diff --git a/llvm/lib/Target/NVPTX/NVPTXIncreaseAlignment.cpp b/llvm/lib/Target/NVPTX/NVPTXIncreaseAlignment.cpp
new file mode 100644
index 0000000000000..cff2ea25c1e6c
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/NVPTXIncreaseAlignment.cpp
@@ -0,0 +1,160 @@
+//===-- NVPTXIncreaseAlignment.cpp - Increase alignment for local arrays --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A simple pass that looks at local memory allocas that are statically
+// sized and potentially increases their alignment. This enables vectorization
+// of loads/stores to these allocas if not explicitly specified by the client.
+//
+// TODO: Ideally we should do a bin-packing of local allocas to maximize
+// alignments while minimizing holes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/NVPTXAddrSpace.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned> MinLocalArrayAlignment(
+    "nvptx-ensure-minimum-local-alignment", cl::init(16), cl::Hidden,
+    cl::desc(
+        "Ensure local memory objects are at least this aligned (default 16)"));
+
+static Align getMaxLocalArrayAlignment(const TargetTransformInfo &TTI) {
+  const unsigned MaxBitWidth =
+      TTI.getLoadStoreVecRegBitWidth(NVPTXAS::ADDRESS_SPACE_LOCAL);
+  return Align(MaxBitWidth / 8);
+}
+
+namespace {
+struct NVPTXIncreaseLocalAlignment {
+  const Align MaxUsableAlign;
+
+  NVPTXIncreaseLocalAlignment(const TargetTransformInfo &TTI)
+      : MaxUsableAlign(getMaxLocalArrayAlignment(TTI)) {}
+
+  bool run(Function &F);
+  bool updateAllocaAlignment(AllocaInst *Alloca, const DataLayout &DL);
+  Align getMaxUsefulArrayAlignment(unsigned ArraySize);
+  Align getMaxSafeLocalAlignment(unsigned ArraySize);
+};
+} // namespace
+
+/// Get the maximum useful alignment for an allocation. This is more likely to
+/// produce holes in the local memory.
+///
+/// Choose an alignment large enough that the entire alloca could be loaded
+/// with a single vector load (if possible). Cap the alignment at
+/// MinLocalArrayAlignment and MaxUsableAlign.
+Align NVPTXIncreaseLocalAlignment::getMaxUsefulArrayAlignment(
+    const unsigned ArraySize) {
+  const Align UpperLimit =
+      std::min(MaxUsableAlign, Align(MinLocalArrayAlignment));
+  return std::min(UpperLimit, Align(PowerOf2Ceil(ArraySize)));
+}
+
+/// Get the alignment of allocas that reduces the chances of leaving holes when
+/// they are allocated within a contiguous memory buffer (like the stack).
+/// Holes are still possible before and after the allocation.
+///
+/// Choose the largest alignment such that the allocation size is a multiple of
+/// the alignment. If all elements of the buffer are allocated in order of
+/// alignment (higher to lower) no holes will be left.
+Align NVPTXIncreaseLocalAlignment::getMaxSafeLocalAlignment(
+    const unsigned ArraySize) {
+  return commonAlignment(MaxUsableAlign, ArraySize);
+}
+
+/// Find a better alignment for local allocas.
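+///
+/// A worked example (a sketch assuming a 128-bit local vector register width,
+/// i.e. MaxUsableAlign = 16, and the default minimum-alignment option of 16):
+/// for a 24-byte alloca the "safe" alignment is commonAlignment(16, 24) = 8
+/// and the "useful" alignment is min(16, PowerOf2Ceil(24)) = 16; the larger
+/// of the two, 16, is applied if it exceeds the alloca's current alignment.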
+bool NVPTXIncreaseLocalAlignment::updateAllocaAlignment(AllocaInst *Alloca,
+                                                        const DataLayout &DL) {
+  if (!Alloca->isStaticAlloca())
+    return false;
+
+  const auto ArraySize = Alloca->getAllocationSize(DL);
+  if (!(ArraySize && ArraySize->isFixed()))
+    return false;
+
+  const auto ArraySizeValue = ArraySize->getFixedValue();
+  if (ArraySizeValue == 0)
+    return false;
+
+  const Align NewAlignment =
+      std::max(getMaxSafeLocalAlignment(ArraySizeValue),
+               getMaxUsefulArrayAlignment(ArraySizeValue));
+
+  if (NewAlignment > Alloca->getAlign()) {
+    Alloca->setAlignment(NewAlignment);
+    return true;
+  }
+
+  return false;
+}
+
+bool NVPTXIncreaseLocalAlignment::run(Function &F) {
+  bool Changed = false;
+  const auto &DL = F.getParent()->getDataLayout();
+
+  BasicBlock &EntryBB = F.getEntryBlock();
+  for (Instruction &I : EntryBB)
+    if (AllocaInst *Alloca = dyn_cast<AllocaInst>(&I))
+      Changed |= updateAllocaAlignment(Alloca, DL);
+
+  return Changed;
+}
+
+namespace {
+struct NVPTXIncreaseLocalAlignmentLegacyPass : public FunctionPass {
+  static char ID;
+  NVPTXIncreaseLocalAlignmentLegacyPass() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+  StringRef getPassName() const override {
+    return "NVPTX Increase Local Alignment";
+  }
+};
+} // namespace
+
+char NVPTXIncreaseLocalAlignmentLegacyPass::ID = 0;
+INITIALIZE_PASS(NVPTXIncreaseLocalAlignmentLegacyPass,
+                "nvptx-increase-local-alignment",
+                "Increase alignment for statically sized allocas", false, false)
+
+FunctionPass *llvm::createNVPTXIncreaseLocalAlignmentPass() {
+  return new NVPTXIncreaseLocalAlignmentLegacyPass();
+}
+
+bool NVPTXIncreaseLocalAlignmentLegacyPass::runOnFunction(Function &F) {
+  const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  return NVPTXIncreaseLocalAlignment(TTI).run(F);
+}
+
+PreservedAnalyses
+NVPTXIncreaseLocalAlignmentPass::run(Function &F,
+                                     FunctionAnalysisManager &FAM) {
+  const auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
+  bool Changed = NVPTXIncreaseLocalAlignment(TTI).run(F);
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
index ee37c9826012c..827cb7bba7018 100644
--- a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
+++ b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
@@ -40,4 +40,5 @@ FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass())
 FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass())
 FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this))
 FUNCTION_PASS("nvptx-tag-invariant-loads", NVPTXTagInvariantLoadsPass())
+FUNCTION_PASS("nvptx-increase-local-alignment", NVPTXIncreaseLocalAlignmentPass())
 #undef FUNCTION_PASS
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 0603994606d71..7426114dd0f89 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -393,6 +393,8 @@ void NVPTXPassConfig::addIRPasses() {
   // but EarlyCSE can do neither of them.
   if (getOptLevel() != CodeGenOptLevel::None) {
     addEarlyCSEOrGVNPass();
+    // Increase alignment for local arrays to improve vectorization.
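+    // This runs just before the LoadStoreVectorizer so the raised alignments
+    // are visible to it.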
+    addPass(createNVPTXIncreaseLocalAlignmentPass());
     if (!DisableLoadStoreVectorizer)
       addPass(createLoadStoreVectorizerPass());
     addPass(createSROAPass());
diff --git a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index 0cd7058174d67..d2504ddd8e76c 100644
--- a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -20,7 +20,7 @@ define ptx_kernel void @kernel_func(ptr %a) {
 entry:
   %buf = alloca [16 x i8], align 4
 
-; CHECK: .local .align 4 .b8 __local_depot0[16]
+; CHECK: .local .align 16 .b8 __local_depot0[16]
 ; CHECK: mov.b64 %SPL
 
 ; CHECK: ld.param.b64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
diff --git a/llvm/test/CodeGen/NVPTX/increase-local-align.ll b/llvm/test/CodeGen/NVPTX/increase-local-align.ll
new file mode 100644
index 0000000000000..6215850a2a22b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/increase-local-align.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=nvptx-increase-local-alignment -nvptx-ensure-minimum-local-alignment=1 < %s | FileCheck %s --check-prefixes=COMMON,MIN-1
+; RUN: opt -S -passes=nvptx-increase-local-alignment -nvptx-ensure-minimum-local-alignment=8 < %s | FileCheck %s --check-prefixes=COMMON,MIN-8
+; RUN: opt -S -passes=nvptx-increase-local-alignment -nvptx-ensure-minimum-local-alignment=16 < %s | FileCheck %s --check-prefixes=COMMON,MIN-16
+target triple = "nvptx64-nvidia-cuda"
+
+define void @test1() {
+; COMMON-LABEL: define void @test1() {
+; COMMON-NEXT:    [[A:%.*]] = alloca i8, align 1
+; COMMON-NEXT:    ret void
+;
+  %a = alloca i8, align 1
+  ret void
+}
+
+define void @test2() {
+; MIN-1-LABEL: define void @test2() {
+; MIN-1-NEXT:    [[A:%.*]] = alloca [63 x i8], align 1
+; MIN-1-NEXT:    ret void
+;
+; MIN-8-LABEL: define void @test2() {
+; MIN-8-NEXT:    [[A:%.*]] = alloca [63 x i8], align 8
+; MIN-8-NEXT:    ret void
+;
+; MIN-16-LABEL: define void @test2() {
+; MIN-16-NEXT:    [[A:%.*]] = alloca [63 x i8], align 16
+; MIN-16-NEXT:    ret void
+;
+  %a = alloca [63 x i8], align 1
+  ret void
+}
+
+define void @test3() {
+; COMMON-LABEL: define void @test3() {
+; COMMON-NEXT:    [[A:%.*]] = alloca [64 x i8], align 16
+; COMMON-NEXT:    ret void
+;
+  %a = alloca [64 x i8], align 1
+  ret void
+}
+
+define void @test4() {
+; MIN-1-LABEL: define void @test4() {
+; MIN-1-NEXT:    [[A:%.*]] = alloca i8, i32 63, align 1
+; MIN-1-NEXT:    ret void
+;
+; MIN-8-LABEL: define void @test4() {
+; MIN-8-NEXT:    [[A:%.*]] = alloca i8, i32 63, align 8
+; MIN-8-NEXT:    ret void
+;
+; MIN-16-LABEL: define void @test4() {
+; MIN-16-NEXT:    [[A:%.*]] = alloca i8, i32 63, align 16
+; MIN-16-NEXT:    ret void
+;
+  %a = alloca i8, i32 63, align 1
+  ret void
+}
+
+define void @test5() {
+; COMMON-LABEL: define void @test5() {
+; COMMON-NEXT:    [[A:%.*]] = alloca i8, i32 64, align 16
+; COMMON-NEXT:    ret void
+;
+  %a = alloca i8, i32 64, align 1
+  ret void
+}
+
+define void @test6() {
+; COMMON-LABEL: define void @test6() {
+; COMMON-NEXT:    [[A:%.*]] = alloca i8, align 32
+; COMMON-NEXT:    ret void
+;
+  %a = alloca i8, align 32
+  ret void
+}
+
+define void @test7() {
+; COMMON-LABEL: define void @test7() {
+; COMMON-NEXT:    [[A:%.*]] = alloca i32, align 4
+; COMMON-NEXT:    ret void
+;
+  %a = alloca i32, align 2
+  ret void
+}
+
+define void @test8() {
+; COMMON-LABEL: define void @test8() {
+; COMMON-NEXT:    [[A:%.*]] = alloca [2 x i32], align 8
+; COMMON-NEXT:    ret void
+;
+  %a = alloca [2 x i32], align 2
+  ret void
+}
+
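+; Zero-sized allocas are left untouched (the pass bails out when the
+; allocation size is 0).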
+define void @test9() {
+; COMMON-LABEL: define void @test9() {
+; COMMON-NEXT:    [[A:%.*]] = alloca [0 x i32], align 1
+; COMMON-NEXT:    ret void
+;
+  %a = alloca [0 x i32], align 1
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
index 5c3017310d0a3..3899b37e140eb 100644
--- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -94,7 +94,7 @@ declare void @bar(ptr %a)
 define void @foo3(i32 %a) {
 ; PTX32-LABEL: foo3(
 ; PTX32:       {
-; PTX32-NEXT:    .local .align 4 .b8 __local_depot2[12];
+; PTX32-NEXT:    .local .align 16 .b8 __local_depot2[16];
 ; PTX32-NEXT:    .reg .b32 %SP;
 ; PTX32-NEXT:    .reg .b32 %SPL;
 ; PTX32-NEXT:    .reg .b32 %r<6>;
@@ -110,7 +110,7 @@ define void @foo3(i32 %a) {
 ;
 ; PTX64-LABEL: foo3(
 ; PTX64:       {
-; PTX64-NEXT:    .local .align 4 .b8 __local_depot2[12];
+; PTX64-NEXT:    .local .align 16 .b8 __local_depot2[16];
 ; PTX64-NEXT:    .reg .b64 %SP;
 ; PTX64-NEXT:    .reg .b64 %SPL;
 ; PTX64-NEXT:    .reg .b32 %r<2>;
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 4784d7093a796..4047579eb4ea3 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -135,7 +135,7 @@ define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out
 ;
 ; PTX-LABEL: escape_ptr(
 ; PTX:       {
-; PTX-NEXT:    .local .align 4 .b8 __local_depot2[8];
+; PTX-NEXT:    .local .align 8 .b8 __local_depot2[8];
 ; PTX-NEXT:    .reg .b64 %SP;
 ; PTX-NEXT:    .reg .b64 %SPL;
 ; PTX-NEXT:    .reg .b32 %r<3>;
@@ -175,7 +175,7 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone
 ;
 ; PTX-LABEL: escape_ptr_gep(
 ; PTX:       {
-; PTX-NEXT:    .local .align 4 .b8 __local_depot3[8];
+; PTX-NEXT:    .local .align 8 .b8 __local_depot3[8];
 ; PTX-NEXT:    .reg .b64 %SP;
 ; PTX-NEXT:    .reg .b64 %SPL;
 ; PTX-NEXT:    .reg .b32 %r<3>;
@@ -190,7 +190,7 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone
 ; PTX-NEXT:    st.local.b32 [%rd2+4], %r1;
 ; PTX-NEXT:    ld.param.b32 %r2, [escape_ptr_gep_param_1];
 ; PTX-NEXT:    st.local.b32 [%rd2], %r2;
-; PTX-NEXT:    add.s64 %rd3, %rd1, 4;
+; PTX-NEXT:    or.b64 %rd3, %rd1, 4;
 ; PTX-NEXT:    { // callseq 1, 0
 ; PTX-NEXT:    .param .b64 param0;
 ; PTX-NEXT:    st.param.b64 [param0], %rd3;
@@ -216,7 +216,7 @@ define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeon
 ;
 ; PTX-LABEL: escape_ptr_store(
 ; PTX:       {
-; PTX-NEXT:    .local .align 4 .b8 __local_depot4[8];
+; PTX-NEXT:    .local .align 8 .b8 __local_depot4[8];
 ; PTX-NEXT:    .reg .b64 %SP;
 ; PTX-NEXT:    .reg .b64 %SPL;
 ; PTX-NEXT:    .reg .b32 %r<3>;
@@ -254,7 +254,7 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri
 ;
 ; PTX-LABEL: escape_ptr_gep_store(
 ; PTX:       {
-; PTX-NEXT:    .local .align 4 .b8 __local_depot5[8];
+; PTX-NEXT:    .local .align 8 .b8 __local_depot5[8];
 ; PTX-NEXT:    .reg .b64 %SP;
 ; PTX-NEXT:    .reg .b64 %SPL;
 ; PTX-NEXT:    .reg .b32 %r<3>;
@@ -271,7 +271,7 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri
 ; PTX-NEXT:    st.local.b32 [%rd4+4], %r1;
 ; PTX-NEXT:    ld.param.b32 %r2, [escape_ptr_gep_store_param_1];
 ; PTX-NEXT:    st.local.b32 [%rd4], %r2;
-; PTX-NEXT:    add.s64 %rd5, %rd3, 4;
+; PTX-NEXT:    or.b64 %rd5, %rd3, 4;
 ; PTX-NEXT:    st.global.b64 [%rd2], %rd5;
 ; PTX-NEXT:    ret;
 entry:
@@ -294,7 +294,7 @@ define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonl
 ;
 ; PTX-LABEL: escape_ptrtoint(
 ; PTX:       {
-; PTX-NEXT:    .local .align 4 .b8 __local_depot6[8];
+; PTX-NEXT:    .local .align 8 .b8 __local_depot6[8];
 ; PTX-NEXT:    .reg .b64 %SP;
 ; PTX-NEXT:    .reg .b64 %SPL;
 ; PTX-NEXT:    .reg .b32 %r<3>;
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index ad2e7044e93bc..17c74227cfbe6 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -101,7 +101,7 @@ declare void @llvm.va_end.p0(ptr)
 define dso_local i32 @foo() {
 ; CHECK-PTX-LABEL: foo(
 ; CHECK-PTX:       {
-; CHECK-PTX-NEXT:    .local .align 8 .b8 __local_depot1[40];
+; CHECK-PTX-NEXT:    .local .align 16 .b8 __local_depot1[48];
 ; CHECK-PTX-NEXT:    .reg .b64 %SP;
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b32 %r<3>;
@@ -138,7 +138,7 @@ entry:
 define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-LABEL: variadics2(
 ; CHECK-PTX:       {
-; CHECK-PTX-NEXT:    .local .align 1 .b8 __local_depot2[3];
+; CHECK-PTX-NEXT:    .local .align 4 .b8 __local_depot2[4];
 ; CHECK-PTX-NEXT:    .reg .b64 %SP;
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b16 %rs<4>;
@@ -198,7 +198,7 @@ declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias
 define dso_local i32 @bar() {
 ; CHECK-PTX-LABEL: bar(
 ; CHECK-PTX:       {
-; CHECK-PTX-NEXT:    .local .align 8 .b8 __local_depot3[24];
+; CHECK-PTX-NEXT:    .local .align 16 .b8 __local_depot3[32];
 ; CHECK-PTX-NEXT:    .reg .b64 %SP;
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b16 %rs<4>;
@@ -215,10 +215,10 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    st.local.b8 [%rd2+1], %rs2;
 ; CHECK-PTX-NEXT:    ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5];
 ; CHECK-PTX-NEXT:    st.local.b8 [%rd2], %rs3;
-; CHECK-PTX-NEXT:    st.b32 [%SP+8], 1;
-; CHECK-PTX-NEXT:    st.b8 [%SP+12], 1;
-; CHECK-PTX-NEXT:    st.b64 [%SP+16], 1;
-; CHECK-PTX-NEXT:    add.u64 %rd3, %SP, 8;
+; CHECK-PTX-NEXT:    st.b32 [%SP+16], 1;
+; CHECK-PTX-NEXT:    st.b8 [%SP+20], 1;
+; CHECK-PTX-NEXT:    st.b64 [%SP+24], 1;
+; CHECK-PTX-NEXT:    add.u64 %rd3, %SP, 16;
 ; CHECK-PTX-NEXT:    { // callseq 1, 0
 ; CHECK-PTX-NEXT:    .param .b32 param0;
 ; CHECK-PTX-NEXT:    st.param.b32 [param0], 1;
@@ -345,7 +345,7 @@ entry:
 define dso_local void @qux() {
 ; CHECK-PTX-LABEL: qux(
 ; CHECK-PTX:       {
-; CHECK-PTX-NEXT:    .local .align 8 .b8 __local_depot7[24];
+; CHECK-PTX-NEXT:    .local .align 16 .b8 __local_depot7[32];
 ; CHECK-PTX-NEXT:    .reg .b64 %SP;
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b32 %r<2>;