Skip to content

Commit

Permalink
[AMDGPU] Fix folding clamp into pseudo scalar instructions (#100568)
Browse files Browse the repository at this point in the history
Clamp is canonically a v_max* instruction with a VGPR dst. Folding clamp
into a pseudo scalar instruction, whose dst is an SGPR, can cause issues
due to the change in register bank. We fix this by inserting a copy.
  • Loading branch information
mbrkusanin authored Jul 25, 2024
1 parent 3295d37 commit 817cd72
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 1 deletion.
13 changes: 12 additions & 1 deletion llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1581,7 +1581,18 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {

// Clamp is applied after omod, so it is OK if omod is set.
DefClamp->setImm(1);
MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());

Register DefReg = Def->getOperand(0).getReg();
Register MIDstReg = MI.getOperand(0).getReg();
if (TRI->isSGPRReg(*MRI, DefReg)) {
// Pseudo scalar instructions have an SGPR dst, whereas clamp is a v_max*
// instruction with a VGPR dst, so copy the result instead of replacing it.
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
MIDstReg)
.addReg(DefReg);
} else {
MRI->replaceRegWith(MIDstReg, DefReg);
}
MI.eraseFromParent();

// Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
Expand Down
26 changes: 26 additions & 0 deletions llvm/test/CodeGen/AMDGPU/si-fold-scalar-clamp.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=si-fold-operands -verify-machineinstrs -o - %s | FileCheck %s

# Runs si-fold-operands on a V_MAX_F32_e64 whose two sources are both the
# sgpr_32 result of the pseudo scalar V_S_RSQ_F32_e64. The expected output has
# no V_MAX_F32_e64 left; instead the V_S_RSQ_F32_e64 result is copied into a
# vgpr_32 that feeds both operands of V_ADD_F32_e32, because the removed
# instruction defined a vgpr_32 while V_S_RSQ_F32_e64 defines an sgpr_32.
---
name: test
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0
; CHECK-LABEL: name: test
; CHECK: liveins: $sgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; CHECK-NEXT: [[V_S_RSQ_F32_e64_:%[0-9]+]]:sgpr_32 = nofpexcept V_S_RSQ_F32_e64 0, [[COPY]], 1, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_S_RSQ_F32_e64_]]
; CHECK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[COPY1]], [[COPY1]], implicit $mode, implicit $exec
; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e32_]]
; CHECK-NEXT: S_ENDPGM 0
%0:sgpr_32 = COPY $sgpr0
%1:sgpr_32 = nofpexcept V_S_RSQ_F32_e64 0, %0, 0, 0, implicit $mode, implicit $exec
%2:vgpr_32 = nofpexcept V_MAX_F32_e64 0, %1, 0, %1, -1, 0, implicit $mode, implicit $exec
%3:vgpr_32 = nofpexcept V_ADD_F32_e32 %2:vgpr_32, %2:vgpr_32, implicit $mode, implicit $exec
$vgpr0 = COPY %3
S_ENDPGM 0
...

0 comments on commit 817cd72

Please sign in to comment.