Skip to content

AMDGPU: Support V_PK_ADD_{MIN|MAX}_{I|U}16 and V_PK_{MIN|MAX}3_{I|U}16 on gfx1250 #150155

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2519,6 +2519,14 @@ def HasFmaakFmamkF64Insts :
Predicate<"Subtarget->hasFmaakFmamkF64Insts()">,
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;

// Predicate for the packed add-min/max instructions: codegen checks
// Subtarget->hasPkAddMinMaxInsts(); the assembler requires
// FeatureGFX1250Insts.
def HasPkAddMinMaxInsts :
Predicate<"Subtarget->hasPkAddMinMaxInsts()">,
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;

// Predicate for the packed min3/max3 instructions: codegen checks
// Subtarget->hasPkMinMax3Insts(); the assembler requires
// FeatureGFX1250Insts.
def HasPkMinMax3Insts :
Predicate<"Subtarget->hasPkMinMax3Insts()">,
AssemblerPredicate<(any_of FeatureGFX1250Insts)>;

// Predicate for image instructions: codegen checks
// Subtarget->hasImageInsts(); the assembler requires FeatureImageInsts.
def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">,
AssemblerPredicate<(all_of FeatureImageInsts)>;

Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -1500,6 +1500,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

// \returns true if GFX1250Insts is set.
// NOTE(review): presumably this gates the VOPD3 encoding -- confirm.
bool hasVOPD3() const { return GFX1250Insts; }

// \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
// Availability follows the GFX1250Insts flag.
bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }

// \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
// Availability follows the GFX1250Insts flag.
bool hasPkMinMax3Insts() const { return GFX1250Insts; }

// \returns true if the target has the S_SETPRIO_INC_WG instruction, as
// recorded in HasSetPrioIncWgInst.
bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; }

Expand Down
54 changes: 54 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3PInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,49 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F
defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
}

// Profile for the packed three-operand integer instructions: a VOP3P_Profile
// built from the all-v2i16 type profile VOP_V2I16_V2I16_V2I16_V2I16 with the
// VOP3_PACKED feature set, and with HasModifiers cleared.
def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
let HasModifiers = 0;
}

// Packed three-operand integer VOP3P pseudos. Every instruction in this block
// is marked commutable and rematerializable and uses PK_ADD_MINMAX_Profile.
let isCommutable = 1, isReMaterializable = 1 in {
// Add combined with a signed/unsigned min or max; gated on
// HasPkAddMinMaxInsts.
let SubtargetPredicate = HasPkAddMinMaxInsts in {
defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile>;
defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile>;
defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile>;
defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile>;
} // End SubtargetPredicate = HasPkAddMinMaxInsts
// Signed/unsigned min3 and max3; gated on HasPkMinMax3Insts.
let SubtargetPredicate = HasPkMinMax3Insts in {
defm V_PK_MAX3_I16 : VOP3PInst<"v_pk_max3_i16", PK_ADD_MINMAX_Profile>;
defm V_PK_MAX3_U16 : VOP3PInst<"v_pk_max3_u16", PK_ADD_MINMAX_Profile>;
defm V_PK_MIN3_I16 : VOP3PInst<"v_pk_min3_i16", PK_ADD_MINMAX_Profile>;
defm V_PK_MIN3_U16 : VOP3PInst<"v_pk_min3_u16", PK_ADD_MINMAX_Profile>;
} // End SubtargetPredicate = HasPkMinMax3Insts
} // End isCommutable = 1, isReMaterializable = 1

// Selection pattern for a packed three-operand instruction.
// Matches ThreeOpFrag<op1, op2> over three operands of type vt and emits inst
// with SRCMODS.OP_SEL_1 as the modifier operand of every source and
// DSTCLAMP.NONE for clamp. vt defaults to the instruction profile's Src0VT and
// RC to getVCSrcForVT<vt>.ret.
// NOTE(review): the trailing 0 operand is presumably the default for a
// remaining modifier operand -- confirm against the VOP3P pseudo operand list.
// TODO: Extend pattern to select op_sel and op_sel_hi.
class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2,
VOP3P_Pseudo inst,
ValueType vt = inst.Pfl.Src0VT,
RegisterOperand RC = getVCSrcForVT<vt>.ret> : GCNPat <
(ThreeOpFrag<op1, op2> vt:$src0, vt:$src1, vt:$src2),
(inst SRCMODS.OP_SEL_1, RC:$src0, SRCMODS.OP_SEL_1, RC:$src1,
SRCMODS.OP_SEL_1, RC:$src2, DSTCLAMP.NONE, 0)
>;

// Select an add combined with a signed/unsigned max or min into the matching
// V_PK_ADD_{MAX|MIN}_{I|U}16 pseudo.
let SubtargetPredicate = HasPkAddMinMaxInsts in {
def : ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16>;
def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>;
def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>;
def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>;
} // End SubtargetPredicate = HasPkAddMinMaxInsts

// Select two chained min or max operations of the same signedness into the
// matching V_PK_{MAX|MIN}3_{I|U}16 pseudo.
let SubtargetPredicate = HasPkMinMax3Insts in {
def : ThreeOp_OpSelClampPats<smax, smax, V_PK_MAX3_I16>;
def : ThreeOp_OpSelClampPats<umax, umax, V_PK_MAX3_U16>;
def : ThreeOp_OpSelClampPats<smin, smin, V_PK_MIN3_I16>;
def : ThreeOp_OpSelClampPats<umin, umin, V_PK_MIN3_U16>;
} // End SubtargetPredicate = HasPkMinMax3Insts

// Defines patterns that extract signed 4bit from each Idx[0].
foreach Idx = [[0,28],[4,24],[8,20],[12,16],[16,12],[20,8],[24,4]] in
def ExtractSigned4bit_#Idx[0] : PatFrag<(ops node:$src),
Expand Down Expand Up @@ -2157,6 +2200,8 @@ multiclass VOP3P_Realtriple_gfx11_gfx12<bits<8> op>

// Instantiates VOP3P_Real_Base for GFX12Gen with the given 8-bit op value.
multiclass VOP3P_Real_gfx12<bits<8> op> : VOP3P_Real_Base<GFX12Gen, op>;

// Instantiates VOP3P_Real_Base for GFX1250Gen with the given 8-bit op value.
multiclass VOP3P_Real_gfx1250<bits<8> op> : VOP3P_Real_Base<GFX1250Gen, op>;

multiclass VOP3P_Real_with_name_gfx12<bits<8> op,
string backing_ps_name = NAME,
string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> :
Expand All @@ -2165,6 +2210,15 @@ multiclass VOP3P_Real_with_name_gfx12<bits<8> op,
// Reals whose backing pseudo and assembly mnemonic differ from the def name:
// the second argument is backing_ps_name and the third is asmName.
defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">;
defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">;

// GFX1250 reals for the packed add-min/max and min3/max3 instructions. The
// argument is the 8-bit op value forwarded to VOP3P_Real_Base.
defm V_PK_ADD_MAX_I16 : VOP3P_Real_gfx1250<0x14>;
defm V_PK_ADD_MAX_U16 : VOP3P_Real_gfx1250<0x15>;
defm V_PK_ADD_MIN_I16 : VOP3P_Real_gfx1250<0x2d>;
defm V_PK_ADD_MIN_U16 : VOP3P_Real_gfx1250<0x2e>;
defm V_PK_MAX3_I16 : VOP3P_Real_gfx1250<0x2f>;
defm V_PK_MAX3_U16 : VOP3P_Real_gfx1250<0x30>;
defm V_PK_MIN3_I16 : VOP3P_Real_gfx1250<0x31>;
defm V_PK_MIN3_U16 : VOP3P_Real_gfx1250<0x32>;

// GFX12 reals for V_PK_MINIMUM_F16 and V_PK_MAXIMUM_F16.
defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>;
defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;

Expand Down
295 changes: 295 additions & 0 deletions llvm/test/CodeGen/AMDGPU/add-max.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,295 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GISEL %s

; i32 add feeding umax, no inreg arguments. The checks expect a separate
; v_add_nc_u32 and v_max_u32 rather than a single combined instruction.
define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: add_max_u32_vvv:
; GCN: ; %bb.0:
; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_max_u32_e32 v0, v0, v2
; GCN-NEXT: ; return to shader part epilog
%sum = add i32 %a, %b
%umax = call i32 @llvm.umax.i32(i32 %sum, i32 %c)
%bits = bitcast i32 %umax to float
ret float %bits
}

; i32 add feeding umax with %a passed inreg (s0 in the checks). The checks
; expect a separate v_add_nc_u32 and v_max_u32.
define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) {
; GCN-LABEL: add_max_u32_svv:
; GCN: ; %bb.0:
; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_max_u32_e32 v0, v0, v1
; GCN-NEXT: ; return to shader part epilog
%sum = add i32 %a, %b
%umax = call i32 @llvm.umax.i32(i32 %sum, i32 %c)
%bits = bitcast i32 %umax to float
ret float %bits
}

; i32 add feeding umax with %a and %b passed inreg. The checks expect a scalar
; s_add_co_i32 followed by v_max_u32.
define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
; GCN-LABEL: add_max_u32_ssv:
; GCN: ; %bb.0:
; GCN-NEXT: s_add_co_i32 s0, s0, s1
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_max_u32_e32 v0, s0, v0
; GCN-NEXT: ; return to shader part epilog
%sum = add i32 %a, %b
%umax = call i32 @llvm.umax.i32(i32 %sum, i32 %c)
%bits = bitcast i32 %umax to float
ret float %bits
}

; i32 add feeding umax with all three arguments passed inreg. The checks
; expect s_add_co_i32 and s_max_u32, then a v_mov_b32 of the result into v0.
define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
; GCN-LABEL: add_max_u32_sss:
; GCN: ; %bb.0:
; GCN-NEXT: s_add_co_i32 s0, s0, s1
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GCN-NEXT: s_max_u32 s0, s0, s2
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: ; return to shader part epilog
%sum = add i32 %a, %b
%umax = call i32 @llvm.umax.i32(i32 %sum, i32 %c)
%bits = bitcast i32 %umax to float
ret float %bits
}

; i32 add feeding umax against the constant 4, with %b passed inreg. The
; checks expect the constant to appear directly as the operand `4` of
; v_max_u32.
define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) {
; GCN-LABEL: add_max_u32_vsi:
; GCN: ; %bb.0:
; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_max_u32_e32 v0, 4, v0
; GCN-NEXT: ; return to shader part epilog
%sum = add i32 %a, %b
%umax = call i32 @llvm.umax.i32(i32 %sum, i32 4)
%bits = bitcast i32 %umax to float
ret float %bits
}

; i32 add feeding umax against the constant 100, with %a passed inreg. The
; checks expect the constant to appear as 0x64 on v_max_u32.
define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) {
; GCN-LABEL: add_max_u32_svl:
; GCN: ; %bb.0:
; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_max_u32_e32 v0, 0x64, v0
; GCN-NEXT: ; return to shader part epilog
%sum = add i32 %a, %b
%umax = call i32 @llvm.umax.i32(i32 %sum, i32 100)
%bits = bitcast i32 %umax to float
ret float %bits
}

; i32 add of an inreg value and the constant 100, feeding umax with %b. The
; checks expect s_addk_co_i32 with 0x64 followed by v_max_u32.
define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) {
; GCN-LABEL: add_max_u32_slv:
; GCN: ; %bb.0:
; GCN-NEXT: s_addk_co_i32 s0, 0x64
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GCN-NEXT: v_max_u32_e32 v0, s0, v0
; GCN-NEXT: ; return to shader part epilog
%sum = add i32 %a, 100
%umax = call i32 @llvm.umax.i32(i32 %sum, i32 %b)
%bits = bitcast i32 %umax to float
ret float %bits
}

; i32 add feeding smax, no inreg arguments. The checks expect a separate
; v_add_nc_u32 and v_max_i32.
define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: add_max_i32_vvv:
; GCN: ; %bb.0:
; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_max_i32_e32 v0, v0, v2
; GCN-NEXT: ; return to shader part epilog
%sum = add i32 %a, %b
%smax = call i32 @llvm.smax.i32(i32 %sum, i32 %c)
%bits = bitcast i32 %smax to float
ret float %bits
}

; i32 add feeding umin, no inreg arguments. The checks expect a separate
; v_add_nc_u32 and v_min_u32.
define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: add_min_u32_vvv:
; GCN: ; %bb.0:
; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_min_u32_e32 v0, v0, v2
; GCN-NEXT: ; return to shader part epilog
%sum = add i32 %a, %b
%umin = call i32 @llvm.umin.i32(i32 %sum, i32 %c)
%bits = bitcast i32 %umin to float
ret float %bits
}

; i32 add feeding smin, no inreg arguments. The checks expect a separate
; v_add_nc_u32 and v_min_i32.
define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: add_min_i32_vvv:
; GCN: ; %bb.0:
; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_min_i32_e32 v0, v0, v2
; GCN-NEXT: ; return to shader part epilog
%sum = add i32 %a, %b
%smin = call i32 @llvm.smin.i32(i32 %sum, i32 %c)
%bits = bitcast i32 %smin to float
ret float %bits
}

; <2 x i16> add feeding umax, no inreg arguments. The checks expect a single
; v_pk_add_max_u16.
define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
; GCN-LABEL: add_max_v2u16_vvv:
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
%add = add <2 x i16> %a, %b
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
}

; Same as above with %a passed inreg (s0 in the checks); still a single
; v_pk_add_max_u16.
define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x i16> %c) {
; GCN-LABEL: add_max_v2u16_svv:
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, v1
; GCN-NEXT: ; return to shader part epilog
%add = add <2 x i16> %a, %b
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
}

; %a and %b passed inreg. SelectionDAG is expected to emit v_pk_add_max_u16;
; GlobalISel is expected to do the add with scalar instructions and finish
; with v_pk_max_u16.
define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> %c) {
; SDAG-LABEL: add_max_v2u16_ssv:
; SDAG: ; %bb.0:
; SDAG-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: add_max_v2u16_ssv:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_lshr_b32 s2, s0, 16
; GISEL-NEXT: s_lshr_b32 s3, s1, 16
; GISEL-NEXT: s_add_co_i32 s0, s0, s1
; GISEL-NEXT: s_add_co_i32 s2, s2, s3
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
; GISEL-NEXT: ; return to shader part epilog
%add = add <2 x i16> %a, %b
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
}

; All three arguments passed inreg. Neither selector is expected to use
; v_pk_add_max_u16 here: SelectionDAG emits v_pk_add_u16 + v_pk_max_u16 and
; GlobalISel emits a fully scalar sequence.
define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> inreg %c) {
; SDAG-LABEL: add_max_v2u16_sss:
; SDAG: ; %bb.0:
; SDAG-NEXT: v_pk_add_u16 v0, s0, s1
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-NEXT: v_pk_max_u16 v0, v0, s2
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: add_max_v2u16_sss:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_lshr_b32 s3, s0, 16
; GISEL-NEXT: s_lshr_b32 s4, s1, 16
; GISEL-NEXT: s_add_co_i32 s0, s0, s1
; GISEL-NEXT: s_add_co_i32 s3, s3, s4
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s3
; GISEL-NEXT: s_and_b32 s3, s2, 0xffff
; GISEL-NEXT: s_lshr_b32 s1, s0, 16
; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
; GISEL-NEXT: s_lshr_b32 s2, s2, 16
; GISEL-NEXT: s_max_u32 s0, s0, s3
; GISEL-NEXT: s_max_u32 s1, s1, s2
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GISEL-NEXT: v_mov_b32_e32 v0, s0
; GISEL-NEXT: ; return to shader part epilog
%add = add <2 x i16> %a, %b
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
}

; umax against the constant vector <4, 0>, with %b passed inreg. The checks
; expect the constant as the operand `4` of v_pk_add_max_u16.
define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) {
; GCN-LABEL: add_max_v2u16_vsi:
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, v0, s0, 4
; GCN-NEXT: ; return to shader part epilog
%add = add <2 x i16> %a, %b
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> <i16 4, i16 0>)
%ret = bitcast <2 x i16> %max to float
ret float %ret
}

; umax against the constant vector <100, 101>, with %a passed inreg. The
; checks expect the constant as the packed literal 0x650064.
define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) {
; GCN-LABEL: add_max_v2u16_svl:
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, 0x650064
; GCN-NEXT: ; return to shader part epilog
%add = add <2 x i16> %a, %b
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> <i16 100, i16 101>)
%ret = bitcast <2 x i16> %max to float
ret float %ret
}

; Add of an inreg value and the constant vector <100, 100>, feeding umax with
; %b. SelectionDAG is expected to emit v_pk_add_max_u16 with the literal
; 0x640064; GlobalISel is expected to add with scalar instructions and finish
; with v_pk_max_u16.
define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) {
; SDAG-LABEL: add_max_v2u16_slv:
; SDAG: ; %bb.0:
; SDAG-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: add_max_v2u16_slv:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_lshr_b32 s1, s0, 16
; GISEL-NEXT: s_add_co_i32 s0, s0, 0x640064
; GISEL-NEXT: s_addk_co_i32 s1, 0x64
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
; GISEL-NEXT: ; return to shader part epilog
%add = add <2 x i16> %a, <i16 100, i16 100>
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %b)
%ret = bitcast <2 x i16> %max to float
ret float %ret
}

; <2 x i16> add feeding smax; the checks expect v_pk_add_max_i16.
define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
; GCN-LABEL: add_max_v2s16_vvv:
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
%add = add <2 x i16> %a, %b
%max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
}

; <2 x i16> add feeding umin; the checks expect v_pk_add_min_u16.
define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
; GCN-LABEL: add_min_v2u16_vvv:
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
%add = add <2 x i16> %a, %b
%min = call <2 x i16> @llvm.umin.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %min to float
ret float %ret
}

; <2 x i16> add feeding smin; the checks expect v_pk_add_min_i16.
define amdgpu_ps float @add_min_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
; GCN-LABEL: add_min_v2s16_vvv:
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
%add = add <2 x i16> %a, %b
%min = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %min to float
ret float %ret
}

; Overloaded intrinsics carry the mangled overload type as a name suffix; for
; <2 x i16> that suffix is .v2i16.
declare <2 x i16> @llvm.smin.v2i16(<2 x i16>, <2 x i16>)
declare <2 x i16> @llvm.smax.v2i16(<2 x i16>, <2 x i16>)
declare <2 x i16> @llvm.umin.v2i16(<2 x i16>, <2 x i16>)
declare <2 x i16> @llvm.umax.v2i16(<2 x i16>, <2 x i16>)
; Scalar i32 signed/unsigned min and max intrinsics called by the i32 tests in
; this file.
declare i32 @llvm.smin.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)
declare i32 @llvm.umin.i32(i32, i32)
declare i32 @llvm.umax.i32(i32, i32)
Loading