Skip to content

Commit

Permalink
Merged main:bb7143f6669345825c214b26fbe336857f4bf523 into amd-gfx:a7f…
Browse files Browse the repository at this point in the history
…5b4f85e9d

Local branch amd-gfx a7f5b4f Merged main:ba976971898d74df38d155c55e008c898120d1e4 into amd-gfx:77eaf56bc521
Remote branch main bb7143f AMDGPU: Avoid creating unnecessary block split in atomic expansion (llvm#102440)
  • Loading branch information
SC llvm team authored and SC llvm team committed Aug 11, 2024
2 parents a7f5b4f + bb7143f commit 857e6d3
Show file tree
Hide file tree
Showing 16 changed files with 63 additions and 234 deletions.
12 changes: 7 additions & 5 deletions libc/benchmarks/gpu/LibcGpuBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,12 +126,14 @@ void print_header() {
LIBC_NAMESPACE::printf("Running Suite: %-10s\n",
benchmarks[0]->get_suite_name().data());
LIBC_NAMESPACE::printf("%s", RESET);
LIBC_NAMESPACE::printf(
cpp::string titles =
"Benchmark | Cycles | Min | Max | "
"Iterations | Time / Iteration | Stddev | Threads |\n");
LIBC_NAMESPACE::printf(
"---------------------------------------------------------------------"
"--------------------------------\n");
"Iterations | Time / Iteration | Stddev | Threads |\n";
LIBC_NAMESPACE::printf(titles.data());

cpp::string separator(titles.size(), '-');
separator[titles.size() - 1] = '\n';
LIBC_NAMESPACE::printf(separator.data());
}

void Benchmark::run_benchmarks() {
Expand Down
8 changes: 4 additions & 4 deletions libc/benchmarks/gpu/src/math/sin_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ BENCH(NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000);
#endif

#ifdef AMDGPU_MATH_FOUND
BENCH(AmdgpuSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
BENCH(AmdgpuSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
BENCH(AmdgpuSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
BENCH(AmdgpuSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
BENCH(AmdSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
BENCH(AmdSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
BENCH(AmdSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
BENCH(AmdSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
#endif
10 changes: 9 additions & 1 deletion libc/newhdrgen/yaml_to_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,15 @@ def add_function_to_yaml(yaml_file, function_details):
if new_function.attributes:
function_dict["attributes"] = new_function.attributes

yaml_data["functions"].append(function_dict)
insert_index = 0
for i, func in enumerate(yaml_data["functions"]):
if func["name"] > new_function.name:
insert_index = i
break
else:
insert_index = len(yaml_data["functions"])

yaml_data["functions"].insert(insert_index, function_dict)

class IndentYamlListDumper(yaml.Dumper):
def increase_indent(self, flow=False, indentless=False):
Expand Down
4 changes: 0 additions & 4 deletions llvm/include/llvm/CodeGen/TargetLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -3843,10 +3843,6 @@ class TargetLowering : public TargetLoweringBase {
/// returned value is a member of the MachineJumpTableInfo::JTEntryKind enum.
virtual unsigned getJumpTableEncoding() const;

/// Return the value type of the register that carries the jump table index
/// from the jump table header to the jump table dispatch. The base
/// implementation simply forwards to getPointerTy(DL); targets may override
/// it to request a different type.
virtual MVT getJumpTableRegTy(const DataLayout &DL) const {
  return getPointerTy(DL);
}

virtual const MCExpr *
LowerCustomJumpTableEntry(const MachineJumpTableInfo * /*MJTI*/,
const MachineBasicBlock * /*MBB*/, unsigned /*uid*/,
Expand Down
2 changes: 1 addition & 1 deletion llvm/include/llvm/Config/llvm-config.h.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

/* Indicate that this is LLVM compiled from the amd-gfx branch. */
#define LLVM_HAVE_BRANCH_AMD_GFX
#define LLVM_MAIN_REVISION 507869
#define LLVM_MAIN_REVISION 507873

/* Define if LLVM_ENABLE_DUMP is enabled */
#cmakedefine LLVM_ENABLE_DUMP
Expand Down
11 changes: 5 additions & 6 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2977,7 +2977,7 @@ void SelectionDAGBuilder::visitJumpTable(SwitchCG::JumpTable &JT) {
// Emit the code for the jump table
assert(JT.SL && "Should set SDLoc for SelectionDAG!");
assert(JT.Reg != -1U && "Should lower JT Header first!");
EVT PTy = DAG.getTargetLoweringInfo().getJumpTableRegTy(DAG.getDataLayout());
EVT PTy = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
SDValue Index = DAG.getCopyFromReg(getControlRoot(), *JT.SL, JT.Reg, PTy);
SDValue Table = DAG.getJumpTable(JT.JTI, PTy);
SDValue BrJumpTable = DAG.getNode(ISD::BR_JT, *JT.SL, MVT::Other,
Expand Down Expand Up @@ -3005,13 +3005,12 @@ void SelectionDAGBuilder::visitJumpTableHeader(SwitchCG::JumpTable &JT,
// This value may be smaller or larger than the target's pointer type, and
// therefore require extension or truncating.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SwitchOp =
DAG.getZExtOrTrunc(Sub, dl, TLI.getJumpTableRegTy(DAG.getDataLayout()));
SwitchOp = DAG.getZExtOrTrunc(Sub, dl, TLI.getPointerTy(DAG.getDataLayout()));

unsigned JumpTableReg =
FuncInfo.CreateReg(TLI.getJumpTableRegTy(DAG.getDataLayout()));
SDValue CopyTo =
DAG.getCopyToReg(getControlRoot(), dl, JumpTableReg, SwitchOp);
FuncInfo.CreateReg(TLI.getPointerTy(DAG.getDataLayout()));
SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), dl,
JumpTableReg, SwitchOp);
JT.Reg = JumpTableReg;

if (!JTH.FallthroughUnreachable) {
Expand Down
8 changes: 0 additions & 8 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16655,9 +16655,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
//
// With this expansion we produce the following code:
// [...]
// br label %atomicrmw.check.shared
//
// atomicrmw.check.shared:
// %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
// br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
//
Expand Down Expand Up @@ -16700,8 +16697,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
Function *F = BB->getParent();
BasicBlock *ExitBB =
BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
BasicBlock *CheckSharedBB =
BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB);
BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
BasicBlock *CheckPrivateBB =
BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
Expand All @@ -16728,9 +16723,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {

std::prev(BB->end())->eraseFromParent();
Builder.SetInsertPoint(BB);
Builder.CreateBr(CheckSharedBB);

Builder.SetInsertPoint(CheckSharedBB);
CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
{Addr}, nullptr, "is.shared");
Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
Expand Down
45 changes: 3 additions & 42 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
Expand Down Expand Up @@ -583,7 +582,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::ROTR, MVT::i8, Expand);
setOperationAction(ISD::BSWAP, MVT::i16, Expand);

setOperationAction(ISD::BR_JT, MVT::Other, Custom);
// Indirect branch is not supported.
// This also disables Jump Table creation.
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BRIND, MVT::Other, Expand);

setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
Expand Down Expand Up @@ -944,9 +945,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::Dummy)
MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED)
MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED)
MAKE_CASE(NVPTXISD::BrxEnd)
MAKE_CASE(NVPTXISD::BrxItem)
MAKE_CASE(NVPTXISD::BrxStart)
MAKE_CASE(NVPTXISD::Tex1DFloatS32)
MAKE_CASE(NVPTXISD::Tex1DFloatFloat)
MAKE_CASE(NVPTXISD::Tex1DFloatFloatLevel)
Expand Down Expand Up @@ -2787,8 +2785,6 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerFP_ROUND(Op, DAG);
case ISD::FP_EXTEND:
return LowerFP_EXTEND(Op, DAG);
case ISD::BR_JT:
return LowerBR_JT(Op, DAG);
case ISD::VAARG:
return LowerVAARG(Op, DAG);
case ISD::VASTART:
Expand All @@ -2814,41 +2810,6 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
}
}

// Custom lowering of ISD::BR_JT.
//
// The branch is rewritten as a run of NVPTX-specific nodes that are tied
// together through both their chain and their glue values:
//
//   BrxStart(chain, table id)
//   BrxItem(chain, target block, glue)      -- one per target except the last
//   BrxEnd(chain, last target block, index, table id, glue)
//
// Operand 0 of Op is the incoming chain, operand 1 the jump table node and
// operand 2 the index value. The BrxEnd node is what gets returned.
SDValue NVPTXTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  SDValue Link = Op.getOperand(0);
  const auto *TableNode = cast<JumpTableSDNode>(Op.getOperand(1));
  SDValue Selector = Op.getOperand(2);

  // Fetch the destination blocks recorded for this jump table.
  unsigned TableId = TableNode->getIndex();
  MachineJumpTableInfo *TableInfo =
      DAG.getMachineFunction().getJumpTableInfo();
  ArrayRef<MachineBasicBlock *> Targets =
      TableInfo->getJumpTables()[TableId].MBBs;

  SDValue TableIdVal = DAG.getConstant(TableId, Loc, MVT::i32);

  // Every node in the sequence yields a chain (value 0) and a glue (value 1).
  SDVTList ChainAndGlue = DAG.getVTList(MVT::Other, MVT::Glue);

  // Open the sequence.
  Link = DAG.getNode(NVPTXISD::BrxStart, Loc, ChainAndGlue, Link, TableIdVal);

  // The last target travels with BrxEnd, so at least one target must exist.
  assert(!Targets.empty());
  ArrayRef<MachineBasicBlock *> LeadingTargets = Targets.drop_back();
  MachineBasicBlock *FinalTarget = Targets.back();

  // One BrxItem per leading target, each consuming the previous chain/glue.
  for (MachineBasicBlock *Target : LeadingTargets)
    Link = DAG.getNode(NVPTXISD::BrxItem, Loc, ChainAndGlue, Link.getValue(0),
                       DAG.getBasicBlock(Target), Link.getValue(1));

  // Close the sequence with the final target, the index and the table id.
  SDValue ClosingOps[] = {Link.getValue(0), DAG.getBasicBlock(FinalTarget),
                          Selector, TableIdVal, Link.getValue(1)};
  return DAG.getNode(NVPTXISD::BrxEnd, Loc, ChainAndGlue, ClosingOps);
}

// Report the jump table encoding as MachineJumpTableInfo::EK_Inline so that
// AsmPrinter does not try to print the jump tables itself.
unsigned NVPTXTargetLowering::getJumpTableEncoding() const {
  return MachineJumpTableInfo::EK_Inline;
}

// This function is almost a copy of SelectionDAG::expandVAArg().
// The only diff is that this one produces loads from local address space.
SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
Expand Down
10 changes: 0 additions & 10 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,6 @@ enum NodeType : unsigned {
BFI,
PRMT,
DYNAMIC_STACKALLOC,
BrxStart,
BrxItem,
BrxEnd,
Dummy,

LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
Expand Down Expand Up @@ -583,11 +580,6 @@ class NVPTXTargetLowering : public TargetLowering {
return true;
}

// brx.idx only accepts an i32 index, so the pointer-typed default is replaced
// with a fixed i32 here.
MVT getJumpTableRegTy(const DataLayout &) const override {
  return MVT::i32;
}

unsigned getJumpTableEncoding() const override;

bool enableAggressiveFMAFusion(EVT VT) const override { return true; }

// The default is to transform llvm.ctlz(x, false) (where false indicates that
Expand Down Expand Up @@ -645,8 +637,6 @@ class NVPTXTargetLowering : public TargetLowering {

SDValue LowerSelect(SDValue Op, SelectionDAG &DAG) const;

SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;

SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;

Expand Down
38 changes: 0 additions & 38 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -3880,44 +3880,6 @@ def DYNAMIC_STACKALLOC64 :
[(set Int64Regs:$ptr, (dyn_alloca Int64Regs:$size, (i32 timm:$align)))]>,
Requires<[hasPTX<73>, hasSM<52>]>;


//
// BRX
//

// Type profiles for the three brx nodes. None of them produces a result
// value; the operand lists are:
//   BrxStart: one integer operand.
//   BrxItem:  one operand of type OtherVT.
//   BrxEnd:   three operands -- one OtherVT followed by two integers.
def SDTBrxStartProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTBrxItemProfile : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
def SDTBrxEndProfile : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisInt<1>, SDTCisInt<2>]>;

// SelectionDAG nodes for the brx sequence. All three are chained and marked
// as having side effects. The glue properties link them in order:
// brx_start only produces glue, brx_item both consumes and produces it, and
// brx_end only consumes it.
def brx_start :
SDNode<"NVPTXISD::BrxStart", SDTBrxStartProfile,
[SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
def brx_item :
SDNode<"NVPTXISD::BrxItem", SDTBrxItemProfile,
[SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def brx_end :
SDNode<"NVPTXISD::BrxEnd", SDTBrxEndProfile,
[SDNPHasChain, SDNPInGlue, SDNPSideEffect]>;

// Instructions selected from the brx_start / brx_item / brx_end nodes. All of
// them are flagged as terminators, branches and indirect branches. Printed
// back to back, their assembly strings form a `.branchtargets` label
// declaration followed by the `brx.idx` that uses it.
let isTerminator = 1, isBranch = 1, isIndirectBranch = 1 in {

// Prints the label for jump table $id and opens the `.branchtargets` list.
// NOTE(review): `$$` is expected to come out as a single literal `$` -- confirm.
def BRX_START :
NVPTXInst<(outs), (ins i32imm:$id),
"$$L_brx_$id: .branchtargets",
[(brx_start (i32 imm:$id))]>;

// Prints one target of the list, followed by a comma.
def BRX_ITEM :
NVPTXInst<(outs), (ins brtarget:$target),
"\t$target,",
[(brx_item bb:$target)]>;

// Prints the final target terminated by a semicolon, then on a new line the
// `brx.idx` instruction taking the i32 register $val and the label for $id.
def BRX_END :
NVPTXInst<(outs), (ins brtarget:$target, Int32Regs:$val, i32imm:$id),
"\t$target;\n\tbrx.idx \t$val, $$L_brx_$id;",
[(brx_end bb:$target, (i32 Int32Regs:$val), (i32 imm:$id))]>;
}


include "NVPTXIntrinsics.td"

//-----------------------------------
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {

define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
; GFX908-LABEL: syncscope_workgroup_nortn:
; GFX908: ; %bb.0: ; %atomicrmw.check.shared
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base
; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
Expand Down Expand Up @@ -272,7 +272,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: syncscope_workgroup_nortn:
; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
Expand Down
Loading

0 comments on commit 857e6d3

Please sign in to comment.