From 44f1c6be81fb2d1e0af02387edd8e297619b32fc Mon Sep 17 00:00:00 2001
From: squidbus <175574877+squidbus@users.noreply.github.com>
Date: Thu, 9 Jan 2025 23:30:16 -0800
Subject: [PATCH] shader_recompiler: Improvements to buffer addressing implementation.

---
 .../frontend/translate/vector_memory.cpp     |  96 +++++++++-------
 .../ir/passes/constant_propagation_pass.cpp  |  12 +-
 .../ir/passes/resource_tracking_pass.cpp     | 103 +++++++++++-------
 src/shader_recompiler/specialization.h       |  11 ++
 src/video_core/amdgpu/resource.h             |  10 ++
 5 files changed, 149 insertions(+), 83 deletions(-)

diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp
index a5b54dff704..8fa0f3f87cc 100644
--- a/src/shader_recompiler/frontend/translate/vector_memory.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp
@@ -164,8 +164,8 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
 }
 
 void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst) {
-    const auto& mtbuf = inst.control.mtbuf;
-    const bool is_ring = mtbuf.glc && mtbuf.slc;
+    const auto& mubuf = inst.control.mubuf;
+    const bool is_ring = mubuf.glc && mubuf.slc;
     const IR::VectorReg vaddr{inst.src[0].code};
     const IR::ScalarReg sharp{inst.src[2].code * 4};
     const IR::Value soffset{GetSrc(inst.src[3])};
@@ -178,22 +178,23 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst)
         if (is_ring) {
             return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset);
         }
-        if (mtbuf.idxen && mtbuf.offen) {
+        if (mubuf.idxen && mubuf.offen) {
             return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
         }
-        if (mtbuf.idxen || mtbuf.offen) {
+        if (mubuf.idxen || mubuf.offen) {
             return ir.GetVectorReg(vaddr);
         }
         return {};
     }();
 
     IR::BufferInstInfo buffer_info{};
-    buffer_info.index_enable.Assign(mtbuf.idxen);
-    buffer_info.offset_enable.Assign(mtbuf.offen);
-    buffer_info.inst_offset.Assign(mtbuf.offset);
-    buffer_info.globally_coherent.Assign(mtbuf.glc);
-    buffer_info.system_coherent.Assign(mtbuf.slc);
+    buffer_info.index_enable.Assign(mubuf.idxen);
+    buffer_info.offset_enable.Assign(mubuf.offen);
+    buffer_info.inst_offset.Assign(mubuf.offset);
+    buffer_info.globally_coherent.Assign(mubuf.glc);
+    buffer_info.system_coherent.Assign(mubuf.slc);
     if (is_typed) {
+        const auto& mtbuf = inst.control.mtbuf;
         const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
         const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);
         ASSERT(nfmt == AmdGpu::NumberFormat::Float &&
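A note on the address operand built by the lambda above (the other buffer translators below build it the same way): its shape depends only on the idxen/offen bits, and the resource tracking pass later relies on the matching index_enable/offset_enable flags to take it apart again. Summarized, assuming the CompositeExtract ordering used in this patch (the is_ring path is separate and keeps its vaddr/soffset pair):

    idxen && offen -> CompositeConstruct(vgpr_index, vgpr_offset); extract 0 = index, extract 1 = offset
    idxen or offen -> a single VGPR holding whichever component is enabled
    neither        -> an empty value; only the immediate inst_offset contributes
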
@@ -220,9 +221,11 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst) {
     const auto& mubuf = inst.control.mubuf;
     const IR::VectorReg vaddr{inst.src[0].code};
     const IR::ScalarReg sharp{inst.src[2].code * 4};
-    ASSERT_MSG(!mubuf.offen && mubuf.offset == 0, "Offsets for image buffers are not supported");
     const IR::Value address = [&] -> IR::Value {
-        if (mubuf.idxen) {
+        if (mubuf.idxen && mubuf.offen) {
+            return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
+        }
+        if (mubuf.idxen || mubuf.offen) {
             return ir.GetVectorReg(vaddr);
         }
         return {};
@@ -230,13 +233,17 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst) {
     }();
     const IR::Value soffset{GetSrc(inst.src[3])};
     ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
-    IR::BufferInstInfo info{};
-    info.index_enable.Assign(mubuf.idxen);
+    IR::BufferInstInfo buffer_info{};
+    buffer_info.index_enable.Assign(mubuf.idxen);
+    buffer_info.offset_enable.Assign(mubuf.offen);
+    buffer_info.inst_offset.Assign(mubuf.offset);
+    buffer_info.globally_coherent.Assign(mubuf.glc);
+    buffer_info.system_coherent.Assign(mubuf.slc);
     const IR::Value handle =
         ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
                               ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
-    const IR::Value value = ir.LoadBufferFormat(handle, address, info);
+    const IR::Value value = ir.LoadBufferFormat(handle, address, buffer_info);
     const IR::VectorReg dst_reg{inst.src[1].code};
     for (u32 i = 0; i < num_dwords; i++) {
         ir.SetVectorReg(dst_reg + i, IR::F32{ir.CompositeExtract(value, i)});
     }
@@ -244,8 +251,8 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst) {
 }
 
 void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst) {
-    const auto& mtbuf = inst.control.mtbuf;
-    const bool is_ring = mtbuf.glc && mtbuf.slc;
+    const auto& mubuf = inst.control.mubuf;
+    const bool is_ring = mubuf.glc && mubuf.slc;
     const IR::VectorReg vaddr{inst.src[0].code};
     const IR::ScalarReg sharp{inst.src[2].code * 4};
     const IR::Value soffset{GetSrc(inst.src[3])};
@@ -259,22 +266,23 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst
         if (is_ring) {
             return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset);
         }
-        if (mtbuf.idxen && mtbuf.offen) {
+        if (mubuf.idxen && mubuf.offen) {
             return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
         }
-        if (mtbuf.idxen || mtbuf.offen) {
+        if (mubuf.idxen || mubuf.offen) {
             return ir.GetVectorReg(vaddr);
         }
         return {};
     }();
 
     IR::BufferInstInfo buffer_info{};
-    buffer_info.index_enable.Assign(mtbuf.idxen);
-    buffer_info.offset_enable.Assign(mtbuf.offen);
-    buffer_info.inst_offset.Assign(mtbuf.offset);
-    buffer_info.globally_coherent.Assign(mtbuf.glc);
-    buffer_info.system_coherent.Assign(mtbuf.slc);
+    buffer_info.index_enable.Assign(mubuf.idxen);
+    buffer_info.offset_enable.Assign(mubuf.offen);
+    buffer_info.inst_offset.Assign(mubuf.offset);
+    buffer_info.globally_coherent.Assign(mubuf.glc);
+    buffer_info.system_coherent.Assign(mubuf.slc);
     if (is_typed) {
+        const auto& mtbuf = inst.control.mtbuf;
         const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
         const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);
         ASSERT(nfmt == AmdGpu::NumberFormat::Float &&
@@ -321,8 +329,12 @@ void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst) {
     const IR::Value soffset{GetSrc(inst.src[3])};
     ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
 
-    IR::BufferInstInfo info{};
-    info.index_enable.Assign(mubuf.idxen);
+    IR::BufferInstInfo buffer_info{};
+    buffer_info.index_enable.Assign(mubuf.idxen);
+    buffer_info.offset_enable.Assign(mubuf.offen);
+    buffer_info.inst_offset.Assign(mubuf.offset);
+    buffer_info.globally_coherent.Assign(mubuf.glc);
+    buffer_info.system_coherent.Assign(mubuf.slc);
 
     const IR::VectorReg src_reg{inst.src[1].code};
 
@@ -338,7 +350,7 @@ void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst) {
     const IR::Value handle =
         ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
                               ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
-    ir.StoreBufferFormat(handle, address, value, info);
+    ir.StoreBufferFormat(handle, address, value, buffer_info);
 }
 
 void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
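Each buffer translator in this file now fills the same five BufferInstInfo fields from the MUBUF control word (BUFFER_ATOMIC below does the same). The patch keeps the assignments inline; purely as an illustration of the mapping, a hypothetical helper (not part of the change) would look like:

    // Hypothetical helper, shown only to summarize the MUBUF -> IR flag mapping
    // repeated in BUFFER_LOAD/STORE(_FORMAT) and BUFFER_ATOMIC.
    template <typename Mubuf>
    IR::BufferInstInfo MakeBufferInstInfo(const Mubuf& mubuf) {
        IR::BufferInstInfo info{};
        info.index_enable.Assign(mubuf.idxen);
        info.offset_enable.Assign(mubuf.offen);
        info.inst_offset.Assign(mubuf.offset);
        info.globally_coherent.Assign(mubuf.glc);
        info.system_coherent.Assign(mubuf.slc);
        return info;
    }
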
@@ -358,10 +370,12 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
     const IR::U32 soffset{GetSrc(inst.src[3])};
     ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
 
-    IR::BufferInstInfo info{};
-    info.index_enable.Assign(mubuf.idxen);
-    info.inst_offset.Assign(mubuf.offset);
-    info.offset_enable.Assign(mubuf.offen);
+    IR::BufferInstInfo buffer_info{};
+    buffer_info.index_enable.Assign(mubuf.idxen);
+    buffer_info.offset_enable.Assign(mubuf.offen);
+    buffer_info.inst_offset.Assign(mubuf.offset);
+    buffer_info.globally_coherent.Assign(mubuf.glc);
+    buffer_info.system_coherent.Assign(mubuf.slc);
 
     IR::Value vdata_val = ir.GetVectorReg(vdata);
     const IR::Value handle =
@@ -371,27 +385,27 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
     const IR::Value original_val = [&] {
         switch (op) {
         case AtomicOp::Swap:
-            return ir.BufferAtomicSwap(handle, address, vdata_val, info);
+            return ir.BufferAtomicSwap(handle, address, vdata_val, buffer_info);
         case AtomicOp::Add:
-            return ir.BufferAtomicIAdd(handle, address, vdata_val, info);
+            return ir.BufferAtomicIAdd(handle, address, vdata_val, buffer_info);
         case AtomicOp::Smin:
-            return ir.BufferAtomicIMin(handle, address, vdata_val, true, info);
+            return ir.BufferAtomicIMin(handle, address, vdata_val, true, buffer_info);
         case AtomicOp::Umin:
-            return ir.BufferAtomicIMin(handle, address, vdata_val, false, info);
+            return ir.BufferAtomicIMin(handle, address, vdata_val, false, buffer_info);
         case AtomicOp::Smax:
-            return ir.BufferAtomicIMax(handle, address, vdata_val, true, info);
+            return ir.BufferAtomicIMax(handle, address, vdata_val, true, buffer_info);
         case AtomicOp::Umax:
-            return ir.BufferAtomicIMax(handle, address, vdata_val, false, info);
+            return ir.BufferAtomicIMax(handle, address, vdata_val, false, buffer_info);
         case AtomicOp::And:
-            return ir.BufferAtomicAnd(handle, address, vdata_val, info);
+            return ir.BufferAtomicAnd(handle, address, vdata_val, buffer_info);
         case AtomicOp::Or:
-            return ir.BufferAtomicOr(handle, address, vdata_val, info);
+            return ir.BufferAtomicOr(handle, address, vdata_val, buffer_info);
         case AtomicOp::Xor:
-            return ir.BufferAtomicXor(handle, address, vdata_val, info);
+            return ir.BufferAtomicXor(handle, address, vdata_val, buffer_info);
         case AtomicOp::Inc:
-            return ir.BufferAtomicInc(handle, address, vdata_val, info);
+            return ir.BufferAtomicInc(handle, address, vdata_val, buffer_info);
         case AtomicOp::Dec:
-            return ir.BufferAtomicDec(handle, address, vdata_val, info);
+            return ir.BufferAtomicDec(handle, address, vdata_val, buffer_info);
         default:
             UNREACHABLE();
         }
diff --git a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp
index fcf2f7d9fb6..26d819d8e96 100644
--- a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp
+++ b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp
@@ -222,9 +222,15 @@ void FoldMul(IR::Block& block, IR::Inst& inst) {
         return;
     }
     const IR::Value rhs{inst.Arg(1)};
-    if (rhs.IsImmediate() && Arg<T>(rhs) == 0) {
-        inst.ReplaceUsesWithAndRemove(IR::Value(0u));
-        return;
+    if (rhs.IsImmediate()) {
+        if (Arg<T>(rhs) == 0) {
+            inst.ReplaceUsesWithAndRemove(IR::Value(0u));
+            return;
+        }
+        if (Arg<T>(rhs) == 1) {
+            inst.ReplaceUsesWithAndRemove(inst.Arg(0));
+            return;
+        }
     }
 }
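The new x * 1 identity is directly useful to the addressing rewrite in the next file: CalculateBufferAddress multiplies the index by an immediate stride, and texture buffers are patched with a stride of 1, so this fold strips the redundant multiply from those address chains (x * 0 continues to fold to 0 as before).
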
diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
index 10d685ed1c2..d62bd2a1ca2 100644
--- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
+++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
@@ -483,55 +483,73 @@ void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descripto
     inst.SetArg(1, ir.Imm32(binding));
 }
 
+IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const Info& info,
+                               const AmdGpu::Buffer& buffer, u32 stride) {
+    const auto inst_info = inst.Flags<IR::BufferInstInfo>();
+
+    // index = (inst_idxen ? vgpr_index : 0) + (const_add_tid_enable ? thread_id[5:0] : 0)
+    IR::U32 index = ir.Imm32(0U);
+    if (inst_info.index_enable) {
+        const IR::U32 vgpr_index{inst_info.offset_enable
+                                     ? IR::U32{ir.CompositeExtract(inst.Arg(1), 0)}
+                                     : IR::U32{inst.Arg(1)}};
+        index = ir.IAdd(index, vgpr_index);
+    }
+    if (buffer.add_tid_enable) {
+        ASSERT_MSG(info.l_stage == LogicalStage::Compute,
+                   "Thread ID buffer addressing is not supported outside of compute.");
+        const IR::U32 thread_id{ir.LaneId()};
+        index = ir.IAdd(index, thread_id);
+    }
+    // offset = (inst_offen ? vgpr_offset : 0) + inst_offset
+    IR::U32 offset = ir.Imm32(inst_info.inst_offset.Value());
+    if (inst_info.offset_enable) {
+        const IR::U32 vgpr_offset = inst_info.index_enable
+                                        ? IR::U32{ir.CompositeExtract(inst.Arg(1), 1)}
+                                        : IR::U32{inst.Arg(1)};
+        offset = ir.IAdd(offset, vgpr_offset);
+    }
+    const IR::U32 const_stride = ir.Imm32(stride);
+    IR::U32 buffer_offset;
+    if (buffer.swizzle_enable) {
+        const IR::U32 const_index_stride = ir.Imm32(buffer.GetIndexStride());
+        const IR::U32 const_element_size = ir.Imm32(buffer.GetElementSize());
+        // index_msb = index / const_index_stride
+        const IR::U32 index_msb{ir.IDiv(index, const_index_stride)};
+        // index_lsb = index % const_index_stride
+        const IR::U32 index_lsb{ir.IMod(index, const_index_stride)};
+        // offset_msb = offset / const_element_size
+        const IR::U32 offset_msb{ir.IDiv(offset, const_element_size)};
+        // offset_lsb = offset % const_element_size
+        const IR::U32 offset_lsb{ir.IMod(offset, const_element_size)};
+        // buffer_offset =
+        //     (index_msb * const_stride + offset_msb * const_element_size) * const_index_stride
+        //     + index_lsb * const_element_size + offset_lsb
+        const IR::U32 buffer_offset_msb = ir.IMul(
+            ir.IAdd(ir.IMul(index_msb, const_stride), ir.IMul(offset_msb, const_element_size)),
+            const_index_stride);
+        const IR::U32 buffer_offset_lsb =
+            ir.IAdd(ir.IMul(index_lsb, const_element_size), offset_lsb);
+        buffer_offset = ir.IAdd(buffer_offset_msb, buffer_offset_lsb);
+    } else {
+        // buffer_offset = index * const_stride + offset
+        buffer_offset = ir.IAdd(ir.IMul(index, const_stride), offset);
+    }
+    return buffer_offset;
+}
+
 void PatchBufferArgs(IR::Block& block, IR::Inst& inst, Info& info) {
     const auto handle = inst.Arg(0);
     const auto buffer_res = info.buffers[handle.U32()];
     const auto buffer = buffer_res.GetSharp(info);
 
-    ASSERT(!buffer.add_tid_enable);
-
     // Address of constant buffer reads can be calculated at IR emission time.
     if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer) {
         return;
     }
 
     IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
-    const auto inst_info = inst.Flags<IR::BufferInstInfo>();
-
-    const IR::U32 index_stride = ir.Imm32(buffer.index_stride);
-    const IR::U32 element_size = ir.Imm32(buffer.element_size);
-
-    // Compute address of the buffer using the stride.
-    IR::U32 address = ir.Imm32(inst_info.inst_offset.Value());
-    if (inst_info.index_enable) {
-        const IR::U32 index = inst_info.offset_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 0)}
-                                                      : IR::U32{inst.Arg(1)};
-        if (buffer.swizzle_enable) {
-            const IR::U32 stride_index_stride =
-                ir.Imm32(static_cast<u32>(buffer.stride * buffer.index_stride));
-            const IR::U32 index_msb = ir.IDiv(index, index_stride);
-            const IR::U32 index_lsb = ir.IMod(index, index_stride);
-            address = ir.IAdd(address, ir.IAdd(ir.IMul(index_msb, stride_index_stride),
-                                               ir.IMul(index_lsb, element_size)));
-        } else {
-            address = ir.IAdd(address, ir.IMul(index, ir.Imm32(buffer.GetStride())));
-        }
-    }
-    if (inst_info.offset_enable) {
-        const IR::U32 offset = inst_info.index_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 1)}
-                                                      : IR::U32{inst.Arg(1)};
-        if (buffer.swizzle_enable) {
-            const IR::U32 element_size_index_stride =
-                ir.Imm32(buffer.element_size * buffer.index_stride);
-            const IR::U32 offset_msb = ir.IDiv(offset, element_size);
-            const IR::U32 offset_lsb = ir.IMod(offset, element_size);
-            address = ir.IAdd(address,
-                              ir.IAdd(ir.IMul(offset_msb, element_size_index_stride), offset_lsb));
-        } else {
-            address = ir.IAdd(address, offset);
-        }
-    }
-    inst.SetArg(1, address);
+    inst.SetArg(1, CalculateBufferAddress(ir, inst, info, buffer, buffer.stride));
 }
 
 void PatchTextureBufferArgs(IR::Block& block, IR::Inst& inst, Info& info) {
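For reference, the swizzled branch above follows the hardware's swizzled buffer addressing scheme as documented in the commented formula. A standalone sketch in plain C++ (illustrative names and a worked example only; the pass emits the equivalent IR ops rather than calling anything like this):

    #include <cstdint>

    uint32_t SwizzledBufferOffset(uint32_t index, uint32_t offset, uint32_t stride,
                                  uint32_t index_stride, uint32_t element_size) {
        const uint32_t index_msb = index / index_stride;   // which swizzle group
        const uint32_t index_lsb = index % index_stride;   // slot within the group
        const uint32_t offset_msb = offset / element_size; // whole elements within the record
        const uint32_t offset_lsb = offset % element_size; // remaining bytes
        return (index_msb * stride + offset_msb * element_size) * index_stride +
               index_lsb * element_size + offset_lsb;
    }

    // Example: stride = 16, index_stride = 8, element_size = 4, index = 10, offset = 6:
    //   index_msb = 1, index_lsb = 2, offset_msb = 1, offset_lsb = 2
    //   -> (1 * 16 + 1 * 4) * 8 + 2 * 4 + 2 = 170 bytes into the buffer
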
@@ -539,8 +557,15 @@ void PatchTextureBufferArgs(IR::Block& block, IR::Inst& inst, Info& info) {
     const auto buffer_res = info.texture_buffers[handle.U32()];
     const auto buffer = buffer_res.GetSharp(info);
 
-    ASSERT(!buffer.swizzle_enable && !buffer.add_tid_enable);
+    // Only linear addressing with index is supported currently, since we cannot yet
+    // address with sub-texel granularity.
+    const auto inst_info = inst.Flags<IR::BufferInstInfo>();
+    ASSERT_MSG(!buffer.swizzle_enable && !inst_info.offset_enable && inst_info.inst_offset == 0,
+               "Unsupported texture buffer address mode.");
+
     IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
+    // Stride of 1 to get an index into formatted data. See above addressing limitations.
+    inst.SetArg(1, CalculateBufferAddress(ir, inst, info, buffer, 1U));
 
     if (inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32) {
         const auto swizzled = ApplySwizzle(ir, inst.Arg(2), buffer.DstSelect());
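With a stride of 1, the address handed to LoadBufferFormat/StoreBufferFormat is an element index into the formatted data rather than a byte offset, which is why the assert above rejects offen, a non-zero inst_offset, and swizzling: those would need the sub-texel (byte-level) addressing that the format path cannot express yet. Thread-ID addressing still works here, since it only adds to the index.
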
diff --git a/src/shader_recompiler/specialization.h b/src/shader_recompiler/specialization.h
index 18b1df1f9ef..523e63497a3 100644
--- a/src/shader_recompiler/specialization.h
+++ b/src/shader_recompiler/specialization.h
@@ -21,10 +21,16 @@ struct VsAttribSpecialization {
 struct BufferSpecialization {
     u16 stride : 14;
     u16 is_storage : 1;
+    u16 swizzle_enable : 1;
+    u8 index_stride : 2 = 0;
+    u8 element_size : 2 = 0;
     u32 size = 0;
 
     bool operator==(const BufferSpecialization& other) const {
         return stride == other.stride && is_storage == other.is_storage &&
+               swizzle_enable == other.swizzle_enable &&
+               (!swizzle_enable ||
+                (index_stride == other.index_stride && element_size == other.element_size)) &&
                (size >= other.size || is_storage);
     }
 };
@@ -101,6 +107,11 @@ struct StageSpecialization {
                      [](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
                          spec.stride = sharp.GetStride();
                          spec.is_storage = desc.IsStorage(sharp);
+                         spec.swizzle_enable = sharp.swizzle_enable;
+                         if (spec.swizzle_enable) {
+                             spec.index_stride = sharp.index_stride;
+                             spec.element_size = sharp.element_size;
+                         }
                          if (!spec.is_storage) {
                              spec.size = sharp.GetSize();
                          }
diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h
index 75b8b2acf72..fa8edb3e2da 100644
--- a/src/video_core/amdgpu/resource.h
+++ b/src/video_core/amdgpu/resource.h
@@ -76,6 +76,16 @@ struct Buffer {
     u32 GetSize() const noexcept {
         return stride == 0 ? num_records : (stride * num_records);
     }
+
+    u32 GetIndexStride() const noexcept {
+        // Index stride is 2 bits, meaning 8, 16, 32, or 64.
+        return 8 << index_stride;
+    }
+
+    u32 GetElementSize() const noexcept {
+        // Element size is 2 bits, meaning 2, 4, 8, or 16.
+        return 2 << element_size;
+    }
 };
 static_assert(sizeof(Buffer) == 16); // 128bits
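The two accessors added to the buffer sharp decode the 2-bit V# fields consumed by the swizzled path; for reference:

    index_stride: raw 0, 1, 2, 3 -> 8, 16, 32, 64   (8 << index_stride)
    element_size: raw 0, 1, 2, 3 -> 2, 4, 8, 16     (2 << element_size)

They are recorded in BufferSpecialization only when swizzle_enable is set, because the linear addressing path never reads them; two sharps that differ only in these fields while swizzling is disabled therefore still match the same specialized shader.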