Skip to content

Commit

Permalink
[AMDGPU] Support preloading hidden kernel arguments (#98861)
Browse files Browse the repository at this point in the history
Adds hidden kernel arguments to the function signature and marks them
inreg if they should be preloaded into user SGPRs. The normal kernarg
preloading logic then takes over with some additional checks for the
correct implicitarg_ptr alignment.

Special care is needed so that metadata for the hidden arguments is not
added twice when generating the code object.
  • Loading branch information
kerbowa authored Oct 7, 2024
1 parent f2b0133 commit c4d8920
Show file tree
Hide file tree
Showing 10 changed files with 1,101 additions and 9 deletions.
10 changes: 10 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1639,6 +1639,10 @@ The AMDGPU backend supports the following LLVM IR attributes.
function which requires AGPRs is reached through any function marked
with this attribute.

"amdgpu-hidden-argument" This attribute is used internally by the backend to mark function arguments
as hidden. Hidden arguments are managed by the compiler and are not part of
the explicit arguments supplied by the user.

======================================= ==========================================================

Calling Conventions
Expand Down Expand Up @@ -5856,6 +5860,12 @@ may insert a trap instruction at the start of the kernel prologue to manage
situations where kernarg preloading is attempted on hardware with incompatible
firmware.

With code object V5 and later, hidden kernel arguments that are normally
accessed through the Implicit Argument Pointer may be preloaded into User SGPRs.
These arguments are added to the kernel function signature and are marked with
the attributes "inreg" and "amdgpu-hidden-argument". (See
:ref:`amdgpu-llvm-ir-attributes-table`).

.. _amdgpu-amdhsa-kernel-prolog:

Kernel Prolog
Expand Down
2 changes: 2 additions & 0 deletions llvm/include/llvm/IR/Argument.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,8 @@ class Argument final : public Value {
/// Check if an argument has a given attribute.
bool hasAttribute(Attribute::AttrKind Kind) const;

bool hasAttribute(StringRef Kind) const;

Attribute getAttribute(Attribute::AttrKind Kind) const;

/// Method for support type inquiry through isa, cast, and dyn_cast.
Expand Down
3 changes: 3 additions & 0 deletions llvm/include/llvm/IR/Function.h
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,9 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node<Function> {
/// check if an attributes is in the list of attributes.
bool hasParamAttribute(unsigned ArgNo, Attribute::AttrKind Kind) const;

/// Check if an attribute is in the list of attributes.
bool hasParamAttribute(unsigned ArgNo, StringRef Kind) const;

/// gets the attribute from the list of attributes.
Attribute getAttributeAtIndex(unsigned i, Attribute::AttrKind Kind) const;

Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/IR/Function.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,10 @@ bool Argument::hasAttribute(Attribute::AttrKind Kind) const {
return getParent()->hasParamAttribute(getArgNo(), Kind);
}

/// Check whether this argument carries the string (target-specific)
/// attribute \p Kind, by querying the parent function's parameter
/// attribute list at this argument's position.
bool Argument::hasAttribute(StringRef Kind) const {
  const Function *F = getParent();
  return F->hasParamAttribute(getArgNo(), Kind);
}

/// Retrieve the enum attribute \p Kind for this argument, delegating to
/// the parent function's per-parameter attribute set.
Attribute Argument::getAttribute(Attribute::AttrKind Kind) const {
  unsigned ArgNo = getArgNo();
  return getParent()->getParamAttribute(ArgNo, Kind);
}
Expand Down Expand Up @@ -738,6 +742,10 @@ bool Function::hasParamAttribute(unsigned ArgNo,
return AttributeSets.hasParamAttr(ArgNo, Kind);
}

bool Function::hasParamAttribute(unsigned ArgNo, StringRef Kind) const {
return AttributeSets.hasParamAttr(ArgNo, Kind);
}

Attribute Function::getAttributeAtIndex(unsigned i,
Attribute::AttrKind Kind) const {
return AttributeSets.getAttributeAtIndex(i, Kind);
Expand Down
6 changes: 5 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,8 +260,12 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
auto &Func = MF.getFunction();
unsigned Offset = 0;
auto Args = HSAMetadataDoc->getArrayNode();
for (auto &Arg : Func.args())
for (auto &Arg : Func.args()) {
if (Arg.hasAttribute("amdgpu-hidden-argument"))
continue;

emitKernelArg(Arg, Offset, Args);
}

emitHiddenKernelArgs(MF, Offset, Args);

Expand Down
202 changes: 200 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
Expand All @@ -31,9 +33,110 @@ class PreloadKernelArgInfo {
const GCNSubtarget &ST;
unsigned NumFreeUserSGPRs;

public:
SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
// Hidden (implicit) kernel arguments that are candidates for preloading
// into user SGPRs. The enumerator order must match the offset-ordered
// HiddenArgs table below, which these values index.
enum HiddenArg : unsigned {
HIDDEN_BLOCK_COUNT_X,
HIDDEN_BLOCK_COUNT_Y,
HIDDEN_BLOCK_COUNT_Z,
HIDDEN_GROUP_SIZE_X,
HIDDEN_GROUP_SIZE_Y,
HIDDEN_GROUP_SIZE_Z,
HIDDEN_REMAINDER_X,
HIDDEN_REMAINDER_Y,
HIDDEN_REMAINDER_Z,
// Sentinel: number of entries, also used as a "not a hidden arg" result.
END_HIDDEN_ARGS
};

// Stores information about a specific hidden argument.
struct HiddenArgInfo {
// Offset in bytes from the location in the kernarg segment pointed to by
// the implicitarg pointer.
uint8_t Offset;
// The size of the hidden argument in bytes.
uint8_t Size;
// The name of the hidden argument in the kernel signature.
const char *Name;
};

// Table of preloadable hidden arguments, indexed by HiddenArg: byte offset
// from the implicitarg pointer, size in bytes, and the parameter name used
// in the cloned kernel signature. Entries are in increasing offset order.
static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
{0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
{8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
{14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
{18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
{22, 2, "_hidden_remainder_z"}};

// Map a byte offset from the implicitarg pointer to the hidden argument
// declared at that offset. Returns END_HIDDEN_ARGS when no table entry
// has a matching offset.
static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
  unsigned Idx = 0;
  while (Idx != END_HIDDEN_ARGS && HiddenArgs[Idx].Offset != Offset)
    ++Idx;
  // Idx == END_HIDDEN_ARGS when the scan found nothing.
  return static_cast<HiddenArg>(Idx);
}

// Return the IR integer type of hidden argument \p HA, sized from the
// HiddenArgs table (Size is in bytes).
static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
  if (HA >= END_HIDDEN_ARGS)
    llvm_unreachable("Unexpected hidden argument.");
  return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
}

// Return the kernel-signature parameter name for hidden argument \p HA.
static const char *getHiddenArgName(HiddenArg HA) {
  if (HA >= END_HIDDEN_ARGS)
    llvm_unreachable("Unexpected hidden argument.");
  return HiddenArgs[HA].Name;
}

// Clones the function after adding implicit arguments to the argument list
// and returns the new updated function. Preloaded implicit arguments are
// added up to and including the last one that will be preloaded, indicated by
// LastPreloadIndex. Currently preloading is only performed on the totality of
// sequential data from the kernarg segment including implicit (hidden)
// arguments. This means that all arguments up to the last preloaded argument
// will also be preloaded even if that data is unused.
Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
  FunctionType *FT = F.getFunctionType();
  LLVMContext &Ctx = F.getParent()->getContext();
  // New parameter list = original parameters followed by one integer
  // parameter per hidden argument up to and including LastPreloadIndex.
  SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
  for (unsigned I = 0; I <= LastPreloadIndex; ++I)
    FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

  FunctionType *NFT =
      FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
  Function *NF =
      Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());

  NF->copyAttributesFrom(&F);
  NF->copyMetadata(&F, 0);
  NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);

  // Insert the clone next to the original, steal its name, and move the
  // whole body over; the old function becomes an empty shell.
  F.getParent()->getFunctionList().insert(F.getIterator(), NF);
  NF->takeName(&F);
  NF->splice(NF->begin(), &F);

  // Rewire uses of the original arguments to the clone's leading arguments.
  Function::arg_iterator NFArg = NF->arg_begin();
  for (Argument &Arg : F.args()) {
    Arg.replaceAllUsesWith(&*NFArg);
    NFArg->takeName(&Arg);
    ++NFArg;
  }

  // NFArg now points at the first appended hidden argument. Mark each one
  // "inreg" plus "amdgpu-hidden-argument" and name it from the table.
  AttrBuilder AB(Ctx);
  AB.addAttribute(Attribute::InReg);
  AB.addAttribute("amdgpu-hidden-argument");
  AttributeList AL = NF->getAttributes();
  for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
    AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
    NFArg++->setName(getHiddenArgName(HiddenArg(I)));
  }

  NF->setAttributes(AL);
  F.replaceAllUsesWith(NF);
  // Demote the original's calling convention so it is no longer treated as
  // a kernel entry point; presumably this prevents its hidden-arg metadata
  // from being emitted twice (see commit description) -- confirm.
  F.setCallingConv(CallingConv::C);

  return NF;
}

public:
// Capture the kernel and subtarget, then compute how many user SGPRs
// remain available for kernarg preloading.
PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
  setInitialFreeUserSGPRsCount();
}
Expand Down Expand Up @@ -64,6 +167,87 @@ class PreloadKernelArgInfo {
NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
return true;
}

// Try to allocate SGPRs to preload implicit kernel arguments.
//
// Scans in-function uses of the amdgcn.implicitarg.ptr intrinsic for simple
// loads at constant offsets matching entries of the HiddenArgs table. The
// candidate loads are processed in increasing offset order and preloaded
// until user SGPRs run out; the kernel is then cloned with the preloaded
// hidden arguments appended to its signature, and each preloaded load is
// rewritten to use the corresponding new argument.
//
// \p ImplicitArgsBaseOffset is the kernarg-segment offset where the hidden
// arguments begin.
// NOTE(review): \p Builder is currently unused in this body -- confirm it
// is still needed before removing it from the signature.
void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
                                     IRBuilder<> &Builder) {
  StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
  Function *ImplicitArgPtr = F.getParent()->getFunction(Name);
  if (!ImplicitArgPtr)
    return;

  const DataLayout &DL = F.getParent()->getDataLayout();
  // Pair is the load and the load offset.
  SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
  for (auto *U : ImplicitArgPtr->users()) {
    Instruction *CI = dyn_cast<Instruction>(U);
    if (!CI || CI->getParent()->getParent() != &F)
      continue;

    for (auto *U : CI->users()) {
      int64_t Offset = 0;
      auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
      if (!Load) {
        if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
          continue;

        // Guard against dead pointer arithmetic: dereferencing
        // user_begin() on an instruction with no users is invalid.
        if (U->user_empty())
          continue;

        Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
      }

      if (!Load || !Load->isSimple())
        continue;

      // FIXME: Expand to handle 64-bit implicit args and large merged loads.
      LLVMContext &Ctx = F.getParent()->getContext();
      Type *LoadTy = Load->getType();
      HiddenArg HA = getHiddenArgFromOffset(Offset);
      if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
        continue;

      ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
    }
  }

  if (ImplicitArgLoads.empty())
    return;

  // Allocate loads in order of offset. We need to be sure that the implicit
  // argument can actually be preloaded.
  std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());

  uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
  // If we fail to preload any implicit argument we know we don't have SGPRs
  // to preload any subsequent ones with larger offsets. Find the first
  // argument that we cannot preload.
  auto *PreloadEnd = std::find_if(
      ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
      [&](const std::pair<LoadInst *, unsigned> &Load) {
        unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
        unsigned LoadOffset = Load.second;
        if (!tryAllocPreloadSGPRs(LoadSize,
                                  LoadOffset + ImplicitArgsBaseOffset,
                                  LastExplicitArgOffset))
          return true;

        LastExplicitArgOffset = LoadOffset + LoadSize;
        return false;
      });

  if (PreloadEnd == ImplicitArgLoads.begin())
    return;

  unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
  Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
  assert(NF);
  // The hidden args occupy the trailing (LastHiddenArgIndex + 1) parameters
  // of the clone; rewrite each preloaded load to use its argument.
  for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
    LoadInst *LoadInst = I->first;
    unsigned LoadOffset = I->second;
    unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
    unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
    Argument *Arg = NF->getArg(Index);
    LoadInst->replaceAllUsesWith(Arg);
  }
}
};

class AMDGPULowerKernelArguments : public FunctionPass {
Expand Down Expand Up @@ -142,6 +326,12 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
uint64_t LastExplicitArgOffset = ExplicitArgOffset;
ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

// Guard against the situation where hidden arguments have already been
// lowered and added to the kernel function signiture, i.e. in a situation
// where this pass has run twice.
if (Arg.hasAttribute("amdgpu-hidden-argument"))
break;

// Try to preload this argument into user SGPRs.
if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
!Arg.getType()->isAggregateType())
Expand Down Expand Up @@ -281,6 +471,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
KernArgSegment->addRetAttr(
Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

if (InPreloadSequence) {
uint64_t ImplicitArgsBaseOffset =
alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
BaseOffset;
PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
Builder);
}

return true;
}

Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,9 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
MaxAlign = Align(1);

for (const Argument &Arg : F.args()) {
if (Arg.hasAttribute("amdgpu-hidden-argument"))
continue;

const bool IsByRef = Arg.hasByRefAttr();
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
Align Alignment = DL.getValueOrABITypeAlignment(
Expand Down
25 changes: 19 additions & 6 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2510,24 +2510,25 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
Function &F = MF.getFunction();
unsigned LastExplicitArgOffset =
MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
bool InPreloadSequence = true;
unsigned InIdx = 0;
bool AlignedForImplictArgs = false;
unsigned ImplicitArgOffset = 0;
for (auto &Arg : F.args()) {
if (!InPreloadSequence || !Arg.hasInRegAttr())
break;

int ArgIdx = Arg.getArgNo();
unsigned ArgIdx = Arg.getArgNo();
// Don't preload non-original args or parts not in the current preload
// sequence.
if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
(int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
if (InIdx < Ins.size() &&
(!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
break;

for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
(int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
Ins[InIdx].getOrigArgIndex() == ArgIdx;
InIdx++) {
assert(ArgLocs[ArgIdx].isMemLoc());
auto &ArgLoc = ArgLocs[InIdx];
Expand All @@ -2537,6 +2538,18 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
unsigned NumAllocSGPRs =
alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

// Fix alignment for hidden arguments.
if (Arg.hasAttribute("amdgpu-hidden-argument")) {
if (!AlignedForImplictArgs) {
ImplicitArgOffset =
alignTo(LastExplicitArgOffset,
Subtarget->getAlignmentForImplicitArgPtr()) -
LastExplicitArgOffset;
AlignedForImplictArgs = true;
}
ArgOffset += ImplicitArgOffset;
}

// Arg is preloaded into the previous SGPR.
if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
Expand Down
Loading

0 comments on commit c4d8920

Please sign in to comment.