Skip to content

Commit

Permalink
[AMDGPU] Support preloading hidden kernel arguments (#98861)
Browse files Browse the repository at this point in the history
Adds hidden kernel arguments to the function signature and marks them
inreg if they should be preloaded into user SGPRs. The normal kernarg
preloading logic then takes over with some additional checks for the
correct implicitarg_ptr alignment.

Special care is needed so that metadata for the hidden arguments is not
added twice when generating the code object.
  • Loading branch information
kerbowa authored Oct 7, 2024
1 parent f2b0133 commit c4d8920
Show file tree
Hide file tree
Showing 10 changed files with 1,101 additions and 9 deletions.
10 changes: 10 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1639,6 +1639,10 @@ The AMDGPU backend supports the following LLVM IR attributes.
function which requires AGPRs is reached through any function marked
with this attribute.

"amdgpu-hidden-argument" This attribute is used internally by the backend to mark function arguments
as hidden. Hidden arguments are managed by the compiler and are not part of
the explicit arguments supplied by the user.

======================================= ==========================================================

Calling Conventions
Expand Down Expand Up @@ -5856,6 +5860,12 @@ may insert a trap instruction at the start of the kernel prologue to manage
situations where kernarg preloading is attempted on hardware with incompatible
firmware.

With code object V5 and later, hidden kernel arguments that are normally
accessed through the Implicit Argument Pointer may be preloaded into User SGPRs.
These arguments are added to the kernel function signature and are marked with
the attributes "inreg" and "amdgpu-hidden-argument". (See
:ref:`amdgpu-llvm-ir-attributes-table`).

.. _amdgpu-amdhsa-kernel-prolog:

Kernel Prolog
Expand Down
2 changes: 2 additions & 0 deletions llvm/include/llvm/IR/Argument.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,8 @@ class Argument final : public Value {
/// Check if an argument has a given attribute.
bool hasAttribute(Attribute::AttrKind Kind) const;

bool hasAttribute(StringRef Kind) const;

Attribute getAttribute(Attribute::AttrKind Kind) const;

/// Method for support type inquiry through isa, cast, and dyn_cast.
Expand Down
3 changes: 3 additions & 0 deletions llvm/include/llvm/IR/Function.h
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,9 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node<Function> {
/// check if an attributes is in the list of attributes.
bool hasParamAttribute(unsigned ArgNo, Attribute::AttrKind Kind) const;

/// Check if an attribute is in the list of attributes.
bool hasParamAttribute(unsigned ArgNo, StringRef Kind) const;

/// gets the attribute from the list of attributes.
Attribute getAttributeAtIndex(unsigned i, Attribute::AttrKind Kind) const;

Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/IR/Function.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,10 @@ bool Argument::hasAttribute(Attribute::AttrKind Kind) const {
return getParent()->hasParamAttribute(getArgNo(), Kind);
}

/// Check whether this argument carries the string (target-specific)
/// attribute \p Kind, by querying the parent function's parameter
/// attribute list at this argument's position.
bool Argument::hasAttribute(StringRef Kind) const {
  const Function *F = getParent();
  return F->hasParamAttribute(getArgNo(), Kind);
}

/// Retrieve the enum attribute \p Kind for this argument, delegating to
/// the parent function's per-parameter attribute set.
Attribute Argument::getAttribute(Attribute::AttrKind Kind) const {
  unsigned ArgNo = getArgNo();
  return getParent()->getParamAttribute(ArgNo, Kind);
}
Expand Down Expand Up @@ -738,6 +742,10 @@ bool Function::hasParamAttribute(unsigned ArgNo,
return AttributeSets.hasParamAttr(ArgNo, Kind);
}

bool Function::hasParamAttribute(unsigned ArgNo, StringRef Kind) const {
return AttributeSets.hasParamAttr(ArgNo, Kind);
}

Attribute Function::getAttributeAtIndex(unsigned i,
Attribute::AttrKind Kind) const {
return AttributeSets.getAttributeAtIndex(i, Kind);
Expand Down
6 changes: 5 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,8 +260,12 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
auto &Func = MF.getFunction();
unsigned Offset = 0;
auto Args = HSAMetadataDoc->getArrayNode();
for (auto &Arg : Func.args())
for (auto &Arg : Func.args()) {
if (Arg.hasAttribute("amdgpu-hidden-argument"))
continue;

emitKernelArg(Arg, Offset, Args);
}

emitHiddenKernelArgs(MF, Offset, Args);

Expand Down
202 changes: 200 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
Expand All @@ -31,9 +33,110 @@ class PreloadKernelArgInfo {
const GCNSubtarget &ST;
unsigned NumFreeUserSGPRs;

public:
SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
// Hidden (implicit) kernel arguments that are candidates for preloading
// into user SGPRs. The enumerator order must match the offset-ordered
// HiddenArgs table below, which these values index.
enum HiddenArg : unsigned {
HIDDEN_BLOCK_COUNT_X,
HIDDEN_BLOCK_COUNT_Y,
HIDDEN_BLOCK_COUNT_Z,
HIDDEN_GROUP_SIZE_X,
HIDDEN_GROUP_SIZE_Y,
HIDDEN_GROUP_SIZE_Z,
HIDDEN_REMAINDER_X,
HIDDEN_REMAINDER_Y,
HIDDEN_REMAINDER_Z,
// Sentinel: number of entries, also used as a "not a hidden arg" result.
END_HIDDEN_ARGS
};

// Stores information about a specific hidden argument.
struct HiddenArgInfo {
// Offset in bytes from the location in the kernarg segment pointed to by
// the implicitarg pointer.
uint8_t Offset;
// The size of the hidden argument in bytes.
uint8_t Size;
// The name of the hidden argument in the kernel signature.
const char *Name;
};

// Table of preloadable hidden arguments, indexed by HiddenArg: byte offset
// from the implicitarg pointer, size in bytes, and the parameter name used
// in the cloned kernel signature. Entries are in increasing offset order.
static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
{0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
{8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
{14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
{18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
{22, 2, "_hidden_remainder_z"}};

// Map a byte offset from the implicitarg pointer to the hidden argument
// declared at that offset. Returns END_HIDDEN_ARGS when no table entry
// has a matching offset.
static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
  unsigned Idx = 0;
  while (Idx != END_HIDDEN_ARGS && HiddenArgs[Idx].Offset != Offset)
    ++Idx;
  // Idx == END_HIDDEN_ARGS when the scan found nothing.
  return static_cast<HiddenArg>(Idx);
}

// Return the IR integer type of hidden argument \p HA, sized from the
// HiddenArgs table (Size is in bytes).
static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
  if (HA >= END_HIDDEN_ARGS)
    llvm_unreachable("Unexpected hidden argument.");
  return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
}

// Return the kernel-signature parameter name for hidden argument \p HA.
static const char *getHiddenArgName(HiddenArg HA) {
  if (HA >= END_HIDDEN_ARGS)
    llvm_unreachable("Unexpected hidden argument.");
  return HiddenArgs[HA].Name;
}

// Clones the function after adding implicit arguments to the argument list
// and returns the new updated function. Preloaded implicit arguments are
// added up to and including the last one that will be preloaded, indicated by
// LastPreloadIndex. Currently preloading is only performed on the totality of
// sequential data from the kernarg segment including implicit (hidden)
// arguments. This means that all arguments up to the last preloaded argument
// will also be preloaded even if that data is unused.
Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
  FunctionType *FT = F.getFunctionType();
  LLVMContext &Ctx = F.getParent()->getContext();
  // New parameter list = original parameters followed by one integer
  // parameter per hidden argument up to and including LastPreloadIndex.
  SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
  for (unsigned I = 0; I <= LastPreloadIndex; ++I)
    FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

  FunctionType *NFT =
      FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
  Function *NF =
      Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());

  NF->copyAttributesFrom(&F);
  NF->copyMetadata(&F, 0);
  NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);

  // Insert the clone next to the original, steal its name, and move the
  // whole body over; the old function becomes an empty shell.
  F.getParent()->getFunctionList().insert(F.getIterator(), NF);
  NF->takeName(&F);
  NF->splice(NF->begin(), &F);

  // Rewire uses of the original arguments to the clone's leading arguments.
  Function::arg_iterator NFArg = NF->arg_begin();
  for (Argument &Arg : F.args()) {
    Arg.replaceAllUsesWith(&*NFArg);
    NFArg->takeName(&Arg);
    ++NFArg;
  }

  // NFArg now points at the first appended hidden argument. Mark each one
  // "inreg" plus "amdgpu-hidden-argument" and name it from the table.
  AttrBuilder AB(Ctx);
  AB.addAttribute(Attribute::InReg);
  AB.addAttribute("amdgpu-hidden-argument");
  AttributeList AL = NF->getAttributes();
  for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
    AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
    NFArg++->setName(getHiddenArgName(HiddenArg(I)));
  }

  NF->setAttributes(AL);
  F.replaceAllUsesWith(NF);
  // Demote the original's calling convention so it is no longer treated as
  // a kernel entry point; presumably this prevents its hidden-arg metadata
  // from being emitted twice (see commit description) -- confirm.
  F.setCallingConv(CallingConv::C);

  return NF;
}

public:
// Capture the kernel and subtarget, then compute how many user SGPRs
// remain available for kernarg preloading.
PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
  setInitialFreeUserSGPRsCount();
}
Expand Down Expand Up @@ -64,6 +167,87 @@ class PreloadKernelArgInfo {
NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
return true;
}

// Try to allocate SGPRs to preload implicit kernel arguments.
//
// Scans in-function uses of the amdgcn.implicitarg.ptr intrinsic for simple
// loads at constant offsets matching entries of the HiddenArgs table. The
// candidate loads are processed in increasing offset order and preloaded
// until user SGPRs run out; the kernel is then cloned with the preloaded
// hidden arguments appended to its signature, and each preloaded load is
// rewritten to use the corresponding new argument.
//
// \p ImplicitArgsBaseOffset is the kernarg-segment offset where the hidden
// arguments begin.
// NOTE(review): \p Builder is currently unused in this body -- confirm it
// is still needed before removing it from the signature.
void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
                                     IRBuilder<> &Builder) {
  StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
  Function *ImplicitArgPtr = F.getParent()->getFunction(Name);
  if (!ImplicitArgPtr)
    return;

  const DataLayout &DL = F.getParent()->getDataLayout();
  // Pair is the load and the load offset.
  SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
  for (auto *U : ImplicitArgPtr->users()) {
    Instruction *CI = dyn_cast<Instruction>(U);
    if (!CI || CI->getParent()->getParent() != &F)
      continue;

    for (auto *U : CI->users()) {
      int64_t Offset = 0;
      auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
      if (!Load) {
        if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
          continue;

        // Guard against dead pointer arithmetic: dereferencing
        // user_begin() on an instruction with no users is invalid.
        if (U->user_empty())
          continue;

        Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
      }

      if (!Load || !Load->isSimple())
        continue;

      // FIXME: Expand to handle 64-bit implicit args and large merged loads.
      LLVMContext &Ctx = F.getParent()->getContext();
      Type *LoadTy = Load->getType();
      HiddenArg HA = getHiddenArgFromOffset(Offset);
      if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
        continue;

      ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
    }
  }

  if (ImplicitArgLoads.empty())
    return;

  // Allocate loads in order of offset. We need to be sure that the implicit
  // argument can actually be preloaded.
  std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());

  uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
  // If we fail to preload any implicit argument we know we don't have SGPRs
  // to preload any subsequent ones with larger offsets. Find the first
  // argument that we cannot preload.
  auto *PreloadEnd = std::find_if(
      ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
      [&](const std::pair<LoadInst *, unsigned> &Load) {
        unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
        unsigned LoadOffset = Load.second;
        if (!tryAllocPreloadSGPRs(LoadSize,
                                  LoadOffset + ImplicitArgsBaseOffset,
                                  LastExplicitArgOffset))
          return true;

        LastExplicitArgOffset = LoadOffset + LoadSize;
        return false;
      });

  if (PreloadEnd == ImplicitArgLoads.begin())
    return;

  unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
  Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
  assert(NF);
  // The hidden args occupy the trailing (LastHiddenArgIndex + 1) parameters
  // of the clone; rewrite each preloaded load to use its argument.
  for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
    LoadInst *LoadInst = I->first;
    unsigned LoadOffset = I->second;
    unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
    unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
    Argument *Arg = NF->getArg(Index);
    LoadInst->replaceAllUsesWith(Arg);
  }
}
};

class AMDGPULowerKernelArguments : public FunctionPass {
Expand Down Expand Up @@ -142,6 +326,12 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
uint64_t LastExplicitArgOffset = ExplicitArgOffset;
ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

// Guard against the situation where hidden arguments have already been
// lowered and added to the kernel function signiture, i.e. in a situation
// where this pass has run twice.
if (Arg.hasAttribute("amdgpu-hidden-argument"))
break;

// Try to preload this argument into user SGPRs.
if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
!Arg.getType()->isAggregateType())
Expand Down Expand Up @@ -281,6 +471,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
KernArgSegment->addRetAttr(
Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

if (InPreloadSequence) {
uint64_t ImplicitArgsBaseOffset =
alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
BaseOffset;
PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
Builder);
}

return true;
}

Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,9 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
MaxAlign = Align(1);

for (const Argument &Arg : F.args()) {
if (Arg.hasAttribute("amdgpu-hidden-argument"))
continue;

const bool IsByRef = Arg.hasByRefAttr();
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
Align Alignment = DL.getValueOrABITypeAlignment(
Expand Down
25 changes: 19 additions & 6 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2510,24 +2510,25 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
Function &F = MF.getFunction();
unsigned LastExplicitArgOffset =
MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
bool InPreloadSequence = true;
unsigned InIdx = 0;
bool AlignedForImplictArgs = false;
unsigned ImplicitArgOffset = 0;
for (auto &Arg : F.args()) {
if (!InPreloadSequence || !Arg.hasInRegAttr())
break;

int ArgIdx = Arg.getArgNo();
unsigned ArgIdx = Arg.getArgNo();
// Don't preload non-original args or parts not in the current preload
// sequence.
if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
(int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
if (InIdx < Ins.size() &&
(!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
break;

for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
(int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
Ins[InIdx].getOrigArgIndex() == ArgIdx;
InIdx++) {
assert(ArgLocs[ArgIdx].isMemLoc());
auto &ArgLoc = ArgLocs[InIdx];
Expand All @@ -2537,6 +2538,18 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
unsigned NumAllocSGPRs =
alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

// Fix alignment for hidden arguments.
if (Arg.hasAttribute("amdgpu-hidden-argument")) {
if (!AlignedForImplictArgs) {
ImplicitArgOffset =
alignTo(LastExplicitArgOffset,
Subtarget->getAlignmentForImplicitArgPtr()) -
LastExplicitArgOffset;
AlignedForImplictArgs = true;
}
ArgOffset += ImplicitArgOffset;
}

// Arg is preloaded into the previous SGPR.
if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
Expand Down
Loading

0 comments on commit c4d8920

Please sign in to comment.