diff --git a/IGC/Compiler/CISACodeGen/CMakeLists.txt b/IGC/Compiler/CISACodeGen/CMakeLists.txt
index e9fc45ff91a2..c97da6e7b680 100644
--- a/IGC/Compiler/CISACodeGen/CMakeLists.txt
+++ b/IGC/Compiler/CISACodeGen/CMakeLists.txt
@@ -78,6 +78,7 @@ set(IGC_BUILD__SRC__CISACodeGen_Common
     "${CMAKE_CURRENT_SOURCE_DIR}/VectorProcess.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/VertexShaderCodeGen.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/VertexShaderLowering.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/ComputeShaderLowering.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/WIAnalysis.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/SLMConstProp.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/POSH_RemoveNonPositionOutput.cpp"
@@ -166,6 +167,7 @@ set(IGC_BUILD__HDR__CISACodeGen_Common
     "${CMAKE_CURRENT_SOURCE_DIR}/VectorProcess.hpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/VertexShaderCodeGen.hpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/VertexShaderLowering.hpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/ComputeShaderLowering.hpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/WIAnalysis.hpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/SLMConstProp.hpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/POSH_RemoveNonPositionOutput.h"
diff --git a/IGC/Compiler/CISACodeGen/ComputeShaderCodeGen.cpp b/IGC/Compiler/CISACodeGen/ComputeShaderCodeGen.cpp
index 6b2f7ec672a6..038c22d4c313 100644
--- a/IGC/Compiler/CISACodeGen/ComputeShaderCodeGen.cpp
+++ b/IGC/Compiler/CISACodeGen/ComputeShaderCodeGen.cpp
@@ -125,27 +125,31 @@ void CComputeShader::ParseShaderSpecificOpcode(llvm::Instruction* inst)
     }
 }
 
-void CComputeShader::CreateThreadPayloadData(void* & pThreadPayload, uint& threadPayloadSize)
+void CComputeShader::CreateThreadPayloadData(void* & pThreadPayload, uint& curbeTotalDataLength, uint& curbeReadLength)
 {
+    typedef uint16_t ThreadPayloadEntry;
+
     // Find the max thread group dimension
     const OctEltUnit SIZE_OF_DQWORD = OctEltUnit(2);
+    const OctEltUnit SIZE_OF_OWORD = OctEltUnit(1);
     uint numberOfId = GetNumberOfId();
     uint dimX = numLanes(m_dispatchSize);
-    uint dimY = (iSTD::Align(m_threadGroupSize, dimX)/dimX) * numberOfId;
-
-    typedef uint ThreadPayloadEntry;
-
-    uint alignedVal = EltUnit(SIZE_OF_DQWORD).Count() * sizeof(DWORD); // Oct Element is 8 DWORDS
+    // dimX must align to alignment_X bytes (one GRF)
+    uint alignment_X = EltUnit(SIZE_OF_OWORD).Count() * sizeof(DWORD);
+    uint dimX_aligned = iSTD::Align(dimX * sizeof(ThreadPayloadEntry), alignment_X) / sizeof(ThreadPayloadEntry);
+    uint dimY = (iSTD::Align(m_threadGroupSize, dimX) / dimX) * numberOfId;
+    curbeReadLength = dimX_aligned * numberOfId * sizeof(ThreadPayloadEntry) / alignment_X;
 
+    uint alignedVal = EltUnit(SIZE_OF_DQWORD).Count() * sizeof(ThreadPayloadEntry); // Oct Element is 8 Entries
     // m_NOSBufferSize is the additional space for cross-thread constant data (constants set by driver).
-    threadPayloadSize = iSTD::Align( dimX * dimY * sizeof( ThreadPayloadEntry ) + m_NOSBufferSize, alignedVal );
+    curbeTotalDataLength = iSTD::Align(dimX_aligned * dimY * sizeof(ThreadPayloadEntry) + m_NOSBufferSize, alignedVal);
 
     assert(pThreadPayload == nullptr && "Thread payload should be a null variable");
 
-    unsigned threadPayloadEntries = threadPayloadSize / sizeof(ThreadPayloadEntry);
+    unsigned threadPayloadEntries = curbeTotalDataLength / sizeof(ThreadPayloadEntry);
 
     ThreadPayloadEntry* pThreadPayloadMem =
-        (ThreadPayloadEntry*)IGC::aligned_malloc(threadPayloadEntries* sizeof(ThreadPayloadEntry), 16);
+        (ThreadPayloadEntry*)IGC::aligned_malloc(threadPayloadEntries * sizeof(ThreadPayloadEntry), 16);
 
     std::fill(pThreadPayloadMem, pThreadPayloadMem + threadPayloadEntries, 0);
     pThreadPayload = pThreadPayloadMem;
@@ -169,17 +173,17 @@ void CComputeShader::CreateThreadPayloadData(void* & pThreadPayload, uint& threa
             uint lane = 0;
             if(m_pThread_ID_in_Group_X)
             {
-                pThreadPayloadMem[(y + lane) * dimX + x] = currThreadX;
+                pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadX;
                 lane++;
             }
             if(m_pThread_ID_in_Group_Y)
             {
-                pThreadPayloadMem[(y + lane) * dimX + x] = currThreadY;
+                pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadY;
                 lane++;
             }
             if(m_pThread_ID_in_Group_Z)
             {
-                pThreadPayloadMem[(y + lane) * dimX + x] = currThreadZ;
+                pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadZ;
                 lane++;
             }
 
@@ -259,19 +263,19 @@ CVariable* CComputeShader::CreateThreadIDinGroup(uint channelNum)
     case 0:
         if(m_pThread_ID_in_Group_X == nullptr)
         {
-            m_pThread_ID_in_Group_X = GetNewVariable(numLanes(m_SIMDSize), ISA_TYPE_D, EALIGN_GRF, false, m_numberInstance);
+            m_pThread_ID_in_Group_X = GetNewVariable(numLanes(m_SIMDSize), ISA_TYPE_W, EALIGN_GRF, false, m_numberInstance);
         }
         return m_pThread_ID_in_Group_X;
     case 1:
         if(m_pThread_ID_in_Group_Y == nullptr)
         {
-            m_pThread_ID_in_Group_Y = GetNewVariable(numLanes(m_SIMDSize), ISA_TYPE_D, EALIGN_GRF, false, m_numberInstance);
+            m_pThread_ID_in_Group_Y = GetNewVariable(numLanes(m_SIMDSize), ISA_TYPE_W, EALIGN_GRF, false, m_numberInstance);
         }
         return m_pThread_ID_in_Group_Y;
     case 2:
         if(m_pThread_ID_in_Group_Z == nullptr)
         {
-            m_pThread_ID_in_Group_Z = GetNewVariable(numLanes(m_SIMDSize), ISA_TYPE_D, EALIGN_GRF, false, m_numberInstance);
+            m_pThread_ID_in_Group_Z = GetNewVariable(numLanes(m_SIMDSize), ISA_TYPE_W, EALIGN_GRF, false, m_numberInstance);
         }
         return m_pThread_ID_in_Group_Z;
     default:
@@ -335,6 +339,7 @@ void CComputeShader::AllocatePayload()
         {
             AllocateInput(m_pThread_ID_in_Group_X, offset, i);
             offset += m_pThread_ID_in_Group_X->GetSize();
+            offset = iSTD::Round(offset, alignmentSize[m_pThread_ID_in_Group_X->GetAlign()]);
         }
     }
 
@@ -344,6 +349,7 @@ void CComputeShader::AllocatePayload()
         {
             AllocateInput(m_pThread_ID_in_Group_Y, offset, i);
             offset += m_pThread_ID_in_Group_Y->GetSize();
+            offset = iSTD::Round(offset, alignmentSize[m_pThread_ID_in_Group_Y->GetAlign()]);
         }
     }
 
@@ -353,6 +359,7 @@ void CComputeShader::AllocatePayload()
         {
             AllocateInput(m_pThread_ID_in_Group_Z, offset, i);
             offset += m_pThread_ID_in_Group_Z->GetSize();
+            offset = iSTD::Round(offset, alignmentSize[m_pThread_ID_in_Group_Z->GetAlign()]);
         }
     }
 
@@ -466,8 +473,6 @@ void CComputeShader::FillProgram(SComputeShaderKernelProgram* pKernelProgram)
     pKernelProgram->FloatingPointMode = USC::GFX3DSTATE_FLOATING_POINT_IEEE_754;
     pKernelProgram->SingleProgramFlow = USC::GFX3DSTATE_PROGRAM_FLOW_MULTIPLE;
     pKernelProgram->CurbeReadOffset = 0;
-    pKernelProgram->CurbeReadLength = GetNumberOfId() * (numLanes(m_dispatchSize) / numLanes(SIMDMode::SIMD8));
-
     pKernelProgram->PhysicalThreadsInGroup = static_cast(
         std::ceil((static_cast(m_threadGroupSize) /
             static_cast((numLanes(m_dispatchSize))))));
@@ -487,7 +492,8 @@ void CComputeShader::FillProgram(SComputeShaderKernelProgram* pKernelProgram)
     pKernelProgram->ThreadPayloadData = nullptr;
     CreateThreadPayloadData(
         pKernelProgram->ThreadPayloadData,
-        pKernelProgram->CurbeTotalDataLength);
+        pKernelProgram->CurbeTotalDataLength,
+        pKernelProgram->CurbeReadLength);
 
     pKernelProgram->ThreadGroupSize = m_threadGroupSize;
 
diff --git a/IGC/Compiler/CISACodeGen/ComputeShaderCodeGen.hpp b/IGC/Compiler/CISACodeGen/ComputeShaderCodeGen.hpp
index 5b7fed9d2f6f..1e44eec37918 100644
--- a/IGC/Compiler/CISACodeGen/ComputeShaderCodeGen.hpp
+++ b/IGC/Compiler/CISACodeGen/ComputeShaderCodeGen.hpp
@@ -43,7 +43,7 @@ class CComputeShader : public CShader
     void FillProgram(SComputeShaderKernelProgram* pKernelProgram);
     void PreCompile() override;
     void ExtractGlobalVariables() override;
-    void CreateThreadPayloadData(void* & pThreadPayload, uint& threadPayloadSize);
+    void CreateThreadPayloadData(void* & pThreadPayload, uint& curbeTotalDataLength, uint& curbeReadLength);
     uint GetNumberOfId();
     void ParseShaderSpecificOpcode(llvm::Instruction* inst) override;
 
diff --git a/IGC/Compiler/CISACodeGen/ComputeShaderLowering.cpp b/IGC/Compiler/CISACodeGen/ComputeShaderLowering.cpp
new file mode 100644
index 000000000000..5b66b08aeff7
--- /dev/null
+++ b/IGC/Compiler/CISACodeGen/ComputeShaderLowering.cpp
@@ -0,0 +1,141 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+#include "ComputeShaderLowering.hpp"
+#include "IGCPassSupport.h"
+#include "GenISAIntrinsics/GenIntrinsicInst.h"
+#include "AdaptorCommon/ImplicitArgs.hpp"
+#include "common/LLVMWarningsPush.hpp"
+#include "llvm/IR/Function.h"
+#include "common/LLVMWarningsPop.hpp"
+#include <vector>
+
+using namespace llvm;
+using namespace IGC;
+using namespace IGC::IGCMD;
+
+/// Legalization pass that re-declares the compute shader thread-ID-in-group
+/// system values (X, Y, Z) as 16-bit GenISA_DCL_SystemValue declarations.
+class ComputeShaderLowering : public FunctionPass
+{
+public:
+    ComputeShaderLowering() : FunctionPass(ID) {}
+    virtual bool runOnFunction(Function &F) override;
+    virtual llvm::StringRef getPassName() const override
+    {
+        return "ComputeShaderLowering";
+    }
+    virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override
+    {
+        AU.setPreservesCFG();
+        AU.addRequired<CodeGenContextWrapper>();
+    }
+    static char ID;
+protected:
+    Function* m_function = nullptr;
+    /// Replaces a THREAD_ID_IN_GROUP_{X,Y,Z} declaration with a 16-bit one
+    /// and erases the original; returns true when the IR was changed.
+    bool shortenThreadID(GenIntrinsicInst& inst, Function &F);
+};
+
+char ComputeShaderLowering::ID = 0;
+
+bool ComputeShaderLowering::runOnFunction(Function &F)
+{
+    // Collect the declarations first: shortenThreadID() inserts and erases
+    // instructions, which must not happen while the lists are being walked.
+    std::vector<GenIntrinsicInst*> systemValues;
+    for(auto BI = F.begin(), BE = F.end(); BI != BE; BI++)
+    {
+        for(auto II = BI->begin(), IE = BI->end(); II != IE; II++)
+        {
+            if(GenIntrinsicInst* inst = dyn_cast<GenIntrinsicInst>(II))
+            {
+                if(inst->getIntrinsicID() == GenISAIntrinsic::GenISA_DCL_SystemValue)
+                {
+                    systemValues.push_back(inst);
+                }
+            }
+        }
+    }
+
+    bool changed = false;
+    for(GenIntrinsicInst* inst : systemValues)
+    {
+        changed |= shortenThreadID(*inst, F);
+    }
+
+    // Report a modification only when a declaration was actually rewritten.
+    return changed;
+}
+
+bool ComputeShaderLowering::shortenThreadID(GenIntrinsicInst& inst, Function &F)
+{
+    SGVUsage usage =
+        static_cast<SGVUsage>(llvm::cast<llvm::ConstantInt>(inst.getOperand(0))->getZExtValue());
+    if (THREAD_ID_IN_GROUP_X != usage &&
+        THREAD_ID_IN_GROUP_Y != usage &&
+        THREAD_ID_IN_GROUP_Z != usage
+        )
+    {
+        return false;
+    }
+
+    // Already a 16-bit declaration (e.g. the pass ran before): re-lowering
+    // would try to bitcast the widened i32 back to i16, which is invalid.
+    if (inst.getType()->isIntegerTy(16))
+    {
+        return false;
+    }
+
+    llvm::Module* module = F.getParent();
+    IRBuilder<> builder(&inst);
+    llvm::Value* vSGV = builder.getInt32(usage);
+    llvm::Function* funcSGV = llvm::GenISAIntrinsic::getDeclaration(module, GenISAIntrinsic::GenISA_DCL_SystemValue, builder.getInt16Ty());
+    llvm::Value* vSGVCreate = builder.CreateCall(funcSGV, vSGV);
+    vSGVCreate = builder.CreateZExtOrTrunc(vSGVCreate, builder.getInt32Ty());
+    vSGVCreate = builder.CreateBitCast(vSGVCreate, inst.getType());
+    inst.replaceAllUsesWith(vSGVCreate);
+    // Drop the wide declaration: EmitPass::emitCSSGV asserts that every
+    // thread-ID-in-group declaration it sees is 16 bit.
+    inst.eraseFromParent();
+
+    return true;
+}
+
+namespace IGC {
+#define PASS_FLAG "igc-compute-shader-lowering"
+#define PASS_DESCRIPTION "This is the compute shader lowering pass "
+#define PASS_CFG_ONLY false
+#define PASS_ANALYSIS true
+IGC_INITIALIZE_PASS_BEGIN(ComputeShaderLowering, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
+IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
+IGC_INITIALIZE_PASS_END(ComputeShaderLowering, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
+
+FunctionPass* CreateComputeShaderLowering()
+{
+    return new ComputeShaderLowering();
+}
+}
diff --git a/IGC/Compiler/CISACodeGen/ComputeShaderLowering.hpp b/IGC/Compiler/CISACodeGen/ComputeShaderLowering.hpp
new file mode 100644
index 000000000000..9019cf5192f6
--- /dev/null
+++ b/IGC/Compiler/CISACodeGen/ComputeShaderLowering.hpp
@@ -0,0 +1,34 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+#pragma once
+#include "common/LLVMWarningsPush.hpp"
+#include <llvm/Pass.h>
+#include "common/LLVMWarningsPop.hpp"
+
+namespace IGC
+{
+llvm::FunctionPass* CreateComputeShaderLowering();
+}
diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp
index 1dee83116308..6e7e8598c969 100644
--- a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp
+++ b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp
@@ -6779,18 +6779,21 @@ void EmitPass::emitCSSGV(GenIntrinsicInst* inst)
     }
     case THREAD_ID_IN_GROUP_X:
     {
+        assert(inst->getType() == Type::getInt16Ty(inst->getContext()) && "only 16bit ThreadID is supported now.");
         pThreadIdInGroup = csProgram->CreateThreadIDinGroup(0);
         m_currShader->CopyVariable(m_destination, pThreadIdInGroup);
         break;
     }
     case THREAD_ID_IN_GROUP_Y:
     {
+        assert(inst->getType() == Type::getInt16Ty(inst->getContext()) && "only 16bit ThreadID is supported now.");
         pThreadIdInGroup = csProgram->CreateThreadIDinGroup(1);
         m_currShader->CopyVariable(m_destination, pThreadIdInGroup);
         break;
     }
     case THREAD_ID_IN_GROUP_Z:
     {
+        assert(inst->getType() == Type::getInt16Ty(inst->getContext()) && "only 16bit ThreadID is supported now.");
         pThreadIdInGroup = csProgram->CreateThreadIDinGroup(2);
         m_currShader->CopyVariable(m_destination, pThreadIdInGroup);
         break;
diff --git a/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp b/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp
index c521dab55277..5f1e5c65391e 100644
--- a/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp
+++ b/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp
@@ -72,6 +72,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "Compiler/CISACodeGen/LowerGEPForPrivMem.hpp"
 #include "Compiler/CISACodeGen/POSH_RemoveNonPositionOutput.h"
 #include "Compiler/CISACodeGen/RegisterEstimator.hpp"
+#include "Compiler/CISACodeGen/ComputeShaderLowering.hpp"
 #include "Compiler/CISACodeGen/SLMConstProp.hpp"
 
 #include "Compiler/Optimizer/OpenCLPasses/GenericAddressResolution/GenericAddressDynamicResolution.hpp"
@@ -650,6 +651,9 @@ inline void AddLegalizationPasses(CodeGenContext &ctx, IGCPassManager& mpm)
     case ShaderType::DOMAIN_SHADER:
         mpm.add(createDomainShaderLoweringPass());
         break;
+    case ShaderType::COMPUTE_SHADER:
+        mpm.add(CreateComputeShaderLowering());
+        break;
     default:
         break;
     }
diff --git a/IGC/GenISAIntrinsics/Intrinsic_definitions.py b/IGC/GenISAIntrinsics/Intrinsic_definitions.py
index 271b53d2cd6d..893d355699ae 100644
--- a/IGC/GenISAIntrinsics/Intrinsic_definitions.py
+++ b/IGC/GenISAIntrinsics/Intrinsic_definitions.py
@@ -143,7 +143,7 @@
 # (dwordAttributeOrSetupIndex, e_interpolation_PSOnly)->anyvector
     "GenISA_DCL_ShaderInputVec": ["anyvector",["int","int"],"NoMem"],
     "GenISA_DCL_GSinputVec": ["float4",["int","int"],"NoMem"],
-    "GenISA_DCL_SystemValue": ["anyfloat",["int"],"NoMem"],
+    "GenISA_DCL_SystemValue": ["any:float",["int"],"NoMem"],
     "GenISA_SampleOffsetX": ["float",["int"],"NoMem"],
     "GenISA_SampleOffsetY": ["float",["int"],"NoMem"],
     "GenISA_PixelPositionX": ["short",[],"NoMem"],
diff --git a/visa/Common_ISA.h b/visa/Common_ISA.h
index 32ec6bb960b8..aa97029aafb1 100644
--- a/visa/Common_ISA.h
+++ b/visa/Common_ISA.h
@@ -94,7 +94,7 @@ class G4_Declare;
 #define COMMON_ISA_GRF_REG_SIZE (getGRFSize()) /// # of bytes in a CISA GRF register
 
 #define COMMON_ISA_MAX_ADDRREG_WIDTH 8
-#define COMMON_ISA_MAX_FILENAME_LENGTH 255
+#define COMMON_ISA_MAX_FILENAME_LENGTH 1023
 #define COMMON_ISA_MAX_KERNEL_NAME_LEN 255
 #define COMMON_ISA_MAX_ADDRESS_OFFSET 4096
 
@@ -693,7 +693,7 @@ typedef struct _CISA_INST
     dst = *((type *) &buf[byte_pos]); \
     byte_pos += sizeof(type);
 
-#define STRING_LEN 512
+#define STRING_LEN 1024
 
 struct Common_ISA_Attribute{
     char* name;