From 194a181da7e2cca5f70ec0f9e65119955b3d2b47 Mon Sep 17 00:00:00 2001 From: chuang13 Date: Tue, 21 Nov 2023 13:27:04 +0800 Subject: [PATCH] Update xgl from commit 0e90e78b * Add missing Pal::CoherSampleRate flag on queue family policy mask and images used as VRS attachment * Use new cache ID for internal pipelines * Bump LLPC version to 68 * VK_EXT_primitives_generated_query-Driver Implementation * VK_EXT_frame_boundary-Expose the extension * Remove *CreateFlags typedef replace with Vk*CreateFlags2KHR after maintenance5 is released * Update PAL Version in XGL 834 * Fix performance drop observed in X-Plane with Resize Bar Enabled * Don't attempt to dump a missing binary * Expose PAL's cmdBufBatchedSubmitChainLimit setting in XGL * Fix dEQP-VK.pipeline.shader_object*.samples_1.* fail * Fix memory size for uber fetch internal data * Remove all bltmsaastate related function and parameter * Remove prebltmsaa and postbltrestoremsaa functions for cmdbuffer * Support the debug printf output in the hang state * Add Get64BitInstanceNodePtr from gpurt for compiler * Send Pal with msaastate as non-nullptr * Expose cooperative matrix * Fragment Shading Rate bug fix * Update Khronos Vulkan Headers to 1.3.269 * Fix TriangleCompressionMode setting * Fix failure in dEQP-VK.ray_tracing_pipeline.misc.* tests * Refine graphic pipeline library fast link related code * Enable RT triangle pair compression * Skip trivial task-mesh dispatch * Bump Gpurt Version to 40 * Fix dynamic rendering partial binding depth/stencil * VK_KHR_maintenance5 implementation * Add support for VK_EXT_fragment_shader_interlock * Update default navi3 RT settings, Move VGPR limit from Validate to Override and Remove Quake2 Settings * WWZ Chopped Present after Alt+Tab * Fix an invalid assertion for Pal::ShaderLibraryFunctionInfo::symbolName * Fix ANGLE test failure in dEQP-GLES31.functional.image_load_store.3d* --- cmake/XglOverrides.cmake | 1 + cmake/XglVersions.cmake | 6 +- icd/Loader/LunarG/Lnx/amd-icd.json | 4 +- icd/api/app_shader_optimizer.cpp | 3 +- icd/api/barrier_policy.cpp | 11 +- icd/api/compiler_solution.cpp | 2 + icd/api/compiler_solution_llpc.cpp | 11 +- icd/api/debug_printf.cpp | 158 ++-- icd/api/devmode/devmode_mgr.cpp | 6 +- icd/api/entry.cpp | 15 + icd/api/graphics_pipeline_common.cpp | 55 +- icd/api/include/barrier_policy.h | 4 +- icd/api/include/compiler_solution.h | 7 +- icd/api/include/debug_printf.h | 13 +- icd/api/include/graphics_pipeline_common.h | 10 +- .../khronos/sdk-1.3/vulkan/vulkan_core.h | 136 +++- icd/api/include/pipeline_compiler.h | 20 +- icd/api/include/vk_cmdbuffer.h | 11 +- icd/api/include/vk_compute_pipeline.h | 4 +- icd/api/include/vk_conv.h | 9 +- icd/api/include/vk_defines.h | 3 - icd/api/include/vk_device.h | 50 +- icd/api/include/vk_extensions.h | 3 + icd/api/include/vk_formats.h | 23 +- icd/api/include/vk_graphics_pipeline.h | 19 +- .../include/vk_graphics_pipeline_library.h | 2 +- icd/api/include/vk_physical_device.h | 9 + icd/api/include/vk_pipeline.h | 20 +- icd/api/include/vk_pipeline_layout.h | 2 + icd/api/include/vk_shader.h | 5 +- icd/api/pipeline_binary_cache.cpp | 167 ++-- icd/api/pipeline_compiler.cpp | 735 +++++++++--------- icd/api/raytrace/ray_tracing_device.cpp | 55 +- icd/api/raytrace/ray_tracing_device.h | 16 +- icd/api/raytrace/ray_tracing_util.h | 28 - icd/api/raytrace/vk_ray_tracing_pipeline.cpp | 19 +- icd/api/raytrace/vk_ray_tracing_pipeline.h | 5 +- icd/api/strings/entry_points.txt | 7 + icd/api/strings/extensions.txt | 3 + icd/api/vk_buffer.cpp | 2 +- icd/api/vk_cmdbuffer.cpp | 113 +-- icd/api/vk_cmdbuffer_transfer.cpp | 8 +- icd/api/vk_compute_pipeline.cpp | 4 +- icd/api/vk_device.cpp | 343 ++++++-- icd/api/vk_dispatch.cpp | 7 + icd/api/vk_graphics_pipeline.cpp | 114 ++- icd/api/vk_graphics_pipeline_library.cpp | 4 +- icd/api/vk_image.cpp | 6 +- icd/api/vk_image_view.cpp | 8 +- icd/api/vk_physical_device.cpp | 175 ++++- icd/api/vk_pipeline.cpp | 11 +- icd/api/vk_pipeline_layout.cpp | 17 +- icd/api/vk_query.cpp | 78 +- icd/api/vk_queue.cpp | 2 +- icd/api/vk_shader.cpp | 28 +- icd/api/vk_swapchain.cpp | 4 - icd/res/ver.h | 2 +- icd/settings/settings.cpp | 59 +- icd/settings/settings_xgl.json | 46 +- 59 files changed, 1722 insertions(+), 966 deletions(-) diff --git a/cmake/XglOverrides.cmake b/cmake/XglOverrides.cmake index 430a17d6..042315f9 100644 --- a/cmake/XglOverrides.cmake +++ b/cmake/XglOverrides.cmake @@ -91,6 +91,7 @@ endmacro() macro(xgl_overrides_pal) ### For PAL ########################################################################################################### + set(PAL_BUILD_JEMALLOC OFF CACHE BOOL "Force jemalloc off" FORCE) set(PAL_CLIENT_INTERFACE_MAJOR_VERSION ${ICD_PAL_CLIENT_MAJOR_VERSION} CACHE STRING "${PROJECT_NAME} override." FORCE) diff --git a/cmake/XglVersions.cmake b/cmake/XglVersions.cmake index 993dc386..35657268 100644 --- a/cmake/XglVersions.cmake +++ b/cmake/XglVersions.cmake @@ -28,7 +28,7 @@ include_guard() # This will become the value of PAL_CLIENT_INTERFACE_MAJOR_VERSION. It describes the version of the PAL interface # that the ICD supports. PAL uses this value to enable backwards-compatibility for older interface versions. # It must be updated on each PAL promotion after handling all of the interface changes described in palLib.h. -set(ICD_PAL_CLIENT_MAJOR_VERSION "827") +set(ICD_PAL_CLIENT_MAJOR_VERSION "834") # This will become the value of GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION if ICD_GPUOPEN_DEVMODE_BUILD=1. # It describes the interface version of the gpuopen shared module (part of PAL) that the ICD supports. @@ -37,9 +37,9 @@ set(ICD_GPUOPEN_CLIENT_MAJOR_VERSION "42") #if VKI_RAY_TRACING # This will become the value of GPURT_CLIENT_INTERFACE_MAJOR_VERSION if VKI_RAY_TRACING=1. # It describes the interface version of the GpuRT shared module that the ICD supports. -set(ICD_GPURT_CLIENT_MAJOR_VERSION "39") +set(ICD_GPURT_CLIENT_MAJOR_VERSION "40") #endif # This will become the value of LLPC_CLIENT_INTERFACE_MAJOR_VERSION if ICD_BUILD_LLPC=1. # It describes the version of the interface version of LLPC that the ICD supports. -set(ICD_LLPC_CLIENT_MAJOR_VERSION "66") +set(ICD_LLPC_CLIENT_MAJOR_VERSION "68") diff --git a/icd/Loader/LunarG/Lnx/amd-icd.json b/icd/Loader/LunarG/Lnx/amd-icd.json index 8e9a8dd2..07e518d3 100644 --- a/icd/Loader/LunarG/Lnx/amd-icd.json +++ b/icd/Loader/LunarG/Lnx/amd-icd.json @@ -2,13 +2,13 @@ "file_format_version": "1.0.0", "ICD": { "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so", - "api_version": "1.3.267" + "api_version": "1.3.269" }, "layer": { "name": "VK_LAYER_AMD_switchable_graphics_@ISABITS@", "type": "GLOBAL", "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so", - "api_version": "1.3.267", + "api_version": "1.3.269", "implementation_version": "1", "description": "AMD switchable graphics layer", "functions": { diff --git a/icd/api/app_shader_optimizer.cpp b/icd/api/app_shader_optimizer.cpp index 5652448e..739969e1 100644 --- a/icd/api/app_shader_optimizer.cpp +++ b/icd/api/app_shader_optimizer.cpp @@ -86,6 +86,8 @@ void ShaderOptimizer::CreateShaderOptimizerKey( if (settings.pipelineUseShaderHashAsProfileHash) { + VK_ASSERT(pModuleData != nullptr); + pShaderKey->codeHash.lower = Vkgc::IPipelineDumper::GetShaderHash(pModuleData); pShaderKey->codeHash.upper = 0; } @@ -249,7 +251,6 @@ void ShaderOptimizer::ApplyProfileToShaderCreateInfo( { options.pOptions->disableFMA = shaderCreate.tuningOptions.disableFMA; } - if (shaderCreate.apply.workaroundStorageImageFormats) { options.pOptions->workaroundStorageImageFormats = true; diff --git a/icd/api/barrier_policy.cpp b/icd/api/barrier_policy.cpp index fd0fb454..3fc945b4 100644 --- a/icd/api/barrier_policy.cpp +++ b/icd/api/barrier_policy.cpp @@ -705,6 +705,7 @@ void DeviceBarrierPolicy::InitQueueFamilyPolicy( | Pal::CoherClear | Pal::CoherIndirectArgs | Pal::CoherIndexData + | Pal::CoherSampleRate | Pal::CoherPresent; pPolicy->supportedLayoutUsageMask |= Pal::LayoutColorTarget | Pal::LayoutDepthStencilTarget @@ -1041,6 +1042,12 @@ void ImageBarrierPolicy::InitImageCachePolicy( supportedInputCacheMask |= Pal::CoherDepthStencilTarget | Pal::CoherClear; } + if (usage & VK_IMAGE_USAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR) + { + supportedOutputCacheMask |= Pal::CoherSampleRate; + supportedInputCacheMask |= Pal::CoherSampleRate; + } + // Apply device specific supported cache masks to limit the scope. supportedOutputCacheMask &= pDevice->GetBarrierPolicy().GetSupportedOutputCacheMask(); supportedInputCacheMask &= pDevice->GetBarrierPolicy().GetSupportedInputCacheMask(); @@ -1255,7 +1262,7 @@ uint32_t ImageBarrierPolicy::GetQueueFamilyLayoutEngineMask( // Constructor for buffer barrier policies. BufferBarrierPolicy::BufferBarrierPolicy( Device* pDevice, - BufferUsageFlagBits usage, + VkBufferUsageFlagBits2KHR usage, VkSharingMode sharingMode, uint32_t queueFamilyIndexCount, const uint32_t* pQueueFamilyIndices) @@ -1268,7 +1275,7 @@ BufferBarrierPolicy::BufferBarrierPolicy( // Initialize the cache policy of the buffer according to the input parameters. void BufferBarrierPolicy::InitBufferCachePolicy( Device* pDevice, - BufferUsageFlagBits usage, + VkBufferUsageFlagBits2KHR usage, VkSharingMode sharingMode, uint32_t queueFamilyIndexCount, const uint32_t* pQueueFamilyIndices) diff --git a/icd/api/compiler_solution.cpp b/icd/api/compiler_solution.cpp index 397fe129..8ee0c31a 100644 --- a/icd/api/compiler_solution.cpp +++ b/icd/api/compiler_solution.cpp @@ -347,6 +347,8 @@ void CompilerSolution::UpdateRayTracingFunctionNames( pTable->pFunc[Vkgc::RT_ENTRY_FETCH_HIT_TRIANGLE_FROM_NODE_POINTER]); SetRayTracingFunctionName(entryFuncTable.intrinsic.pFetchTrianglePositionFromRayQuery, pTable->pFunc[Vkgc::RT_ENTRY_FETCH_HIT_TRIANGLE_FROM_RAY_QUERY]); + SetRayTracingFunctionName(entryFuncTable.rayQuery.pGet64BitInstanceNodePtr, + pTable->pFunc[Vkgc::RT_ENTRY_GET_INSTANCE_NODE]); } } diff --git a/icd/api/compiler_solution_llpc.cpp b/icd/api/compiler_solution_llpc.cpp index 11777dd2..4ca0f430 100644 --- a/icd/api/compiler_solution_llpc.cpp +++ b/icd/api/compiler_solution_llpc.cpp @@ -609,11 +609,14 @@ VkResult CompilerSolutionLlpc::CreateGraphicsShaderBinary( m_pPhysicalDevice->Manager()->VkInstance()->FreeMem(const_cast(finalBinary.pCode)); } - if (pPipelineDumpHandle != nullptr) + if ((pPipelineDumpHandle != nullptr) && (pCreateInfo->earlyElfPackage[gplType].pCode != nullptr)) { - Vkgc::BinaryData elfBinary = ExtractPalElfBinary(pCreateInfo->earlyElfPackage[gplType]); - Vkgc::IPipelineDumper::DumpPipelineBinary( - pPipelineDumpHandle, m_gfxIp, &elfBinary); + if (pCreateInfo->earlyElfPackage[gplType].pCode != nullptr) + { + Vkgc::BinaryData elfBinary = ExtractPalElfBinary(pCreateInfo->earlyElfPackage[gplType]); + Vkgc::IPipelineDumper::DumpPipelineBinary( + pPipelineDumpHandle, m_gfxIp, &elfBinary); + } } } m_gplCacheMatrix.totalBinaries++; diff --git a/icd/api/debug_printf.cpp b/icd/api/debug_printf.cpp index ba4c5ad5..13a5c8ed 100644 --- a/icd/api/debug_printf.cpp +++ b/icd/api/debug_printf.cpp @@ -26,11 +26,14 @@ #include "vk_utils.h" #include "vk_device.h" #include "vk_cmdbuffer.h" +#include "vk_queue.h" +#include "vk_semaphore.h" +#include "palHashBaseImpl.h" #include "palPipelineAbi.h" #include "palPipelineAbiReader.h" -#include "palVectorImpl.h" -#include "palHashBaseImpl.h" +#include "palQueue.h" #include "palSysMemory.h" +#include "palVectorImpl.h" #include using namespace vk; @@ -44,7 +47,8 @@ DebugPrintf::DebugPrintf( m_pSettings(nullptr), m_parsedFormatStrings(8, pAllocator), m_frame(0), - m_pAllocator(pAllocator) + m_pAllocator(pAllocator), + m_semaphore(VK_NULL_HANDLE) { } @@ -56,7 +60,9 @@ void DebugPrintf::Reset( if ((m_state == MemoryAllocated) && (m_printfMemory.Size() > 0)) { pDevice->MemMgr()->FreeGpuMem(&m_printfMemory); + Semaphore::ObjectFromHandle(m_semaphore)->Destroy(pDevice, pDevice->VkInstance()->GetAllocCallbacks()); m_state = Enabled; + m_frame = 1; } } @@ -92,7 +98,7 @@ void DebugPrintf::BindPipeline( allocInfo.pal.size = Util::Pow2Align(settings.debugPrintfBufferSize, PAL_PAGE_BYTES); allocInfo.pal.alignment = PAL_PAGE_BYTES; allocInfo.pal.priority = Pal::GpuMemPriority::Normal; - pDevice->MemMgr()->GetCommonPool(InternalPoolDebugCpuRead, &allocInfo); + pDevice->MemMgr()->GetCommonPool(InternalPoolCpuCacheableGpuUncached, &allocInfo); VkResult result = pDevice->MemMgr()->AllocGpuMem(allocInfo, &m_printfMemory, pDevice->GetPalDeviceMask(), @@ -102,7 +108,6 @@ void DebugPrintf::BindPipeline( if (result == VK_SUCCESS) { - m_state = MemoryAllocated; m_pPipeline = pPipeline; const size_t bufferSrdSize = @@ -114,7 +119,7 @@ void DebugPrintf::BindPipeline( srdInfo.gpuAddr = m_printfMemory.GpuVirtAddr(deviceIdx); srdInfo.range = m_printfMemory.Size(); pDevice->PalDevice(deviceIdx)->CreateUntypedBufferViewSrds(1, &srdInfo, pTable); - m_frame = 0; + m_frame = 1; const Pal::uint32* pEntry = reinterpret_cast(&tableVa); pCmdBuffer->CmdSetUserData(static_cast(bindPoint), userDataOffset, 1, pEntry); @@ -128,6 +133,28 @@ void DebugPrintf::BindPipeline( pSubSections->Reserve(1); ParseFormatStringsToSubSection(it.Get()->value.printStr, pSubSections); } + constexpr VkSemaphoreTypeCreateInfo semaphoreTypeInfo + { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO, + .pNext = nullptr, + .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE, + .initialValue = 0 + }; + + VkSemaphoreCreateInfo semaphoreInfo + { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + .pNext = &semaphoreTypeInfo, + .flags = 0 + }; + + result = pDevice->CreateSemaphore( + &semaphoreInfo, pDevice->VkInstance()->GetAllocCallbacks(), &m_semaphore); + + if (result == VK_SUCCESS) + { + m_state = MemoryAllocated; + } } } } @@ -142,7 +169,6 @@ void DebugPrintf::Init( { m_state = Enabled; m_pPipeline = nullptr; - m_frame = 0; m_pSettings = &settings; m_parsedFormatStrings.Init(); @@ -157,22 +183,64 @@ void DebugPrintf::Init( // PostQueue to process executed printf buffer Pal::Result DebugPrintf::PostQueueProcess( Device* pDevice, - uint32_t deviceIdx) + uint32_t deviceIdx, + Queue* pQueue) { - if (m_state != MemoryAllocated) + Pal::Result palResult = Pal::Result::NotReady; + if (m_state == MemoryAllocated) { - return Pal::Result::NotReady; + Util::MutexAuto lock(&m_mutex); + + Semaphore* pVkSemaphore = Semaphore::ObjectFromHandle(m_semaphore); + + Pal::IQueueSemaphore* pPalSemaphore = pVkSemaphore->PalSemaphore(deviceIdx); + palResult = pQueue->PalQueue(deviceIdx)->SignalQueueSemaphore(pPalSemaphore, m_frame); + Util::File file; + PrintfString fileName = GetFileName(m_pPipeline->PalPipelineHash(), + ConvertVkPipelineType(m_pPipeline->GetType()), + m_frame, + m_pSettings->debugPrintfDumpFolder); + const char* pOutputName = m_pSettings->debugPrintfToStdout ? "-" : fileName.Data(); + Util::Result result = file.Open(pOutputName, Util::FileAccessMode::FileAccessAppend); + if (result == Pal::Result::Success) + { + uint64_t decodeOffset = 0; + uint32_t loopIndex = 0; + Pal::IQueueSemaphore* palSemaphores[] = {pPalSemaphore}; + uint64_t waitValues[] = {m_frame}; + while (true) + { + palResult = pDevice->PalDevice(DefaultDeviceIndex)->WaitForSemaphores( + 1, palSemaphores, waitValues, 0, 1000000llu); + + decodeOffset = ProcessDebugPrintfBuffer(pDevice, deviceIdx, decodeOffset, &file); + if ((PalToVkResult(palResult) <= 0) || (loopIndex++ > 1000)) + { + break; + } + } + file.Close(); + } } - Util::MutexAuto lock(&m_mutex); - pDevice->WaitIdle(); + return palResult; +} +// ===================================================================================================================== +// Process the output debug buffer and output the decoded printf strings +uint64_t DebugPrintf::ProcessDebugPrintfBuffer( + Device* pDevice, + uint32_t deviceIdx, + uint64_t decodeOffset, + Util::File* pFile) +{ void* pCpuAddr = nullptr; Pal::Result palResult = m_printfMemory.Map(deviceIdx, &pCpuAddr); - uint64_t bufferSize = 0; + uint64_t bufferSize = 0; uint32_t* pPrintBuffer = nullptr; uint32_t* pPtr = nullptr; constexpr uint32_t bufferHeaderSize = 4; uint64_t maxBufferDWSize = (m_printfMemory.Size() >> 2) - bufferHeaderSize; + if (palResult == Pal::Result::Success) { // Buffer Header is 4 dword {BufferOffset_Loword, BufferOffset_Hiword, rerv0, rerv1}; @@ -182,21 +250,21 @@ Pal::Result DebugPrintf::PostQueueProcess( pPtr += 2; bufferSize = (static_cast(bufferSizeHigh) << 32) | static_cast(bufferSizeLower); bufferSize = Util::Min(bufferSize, maxBufferDWSize); - if (bufferSize > 0) + uint64_t remainBufferSize = bufferSize - decodeOffset; + if (remainBufferSize > 1) { pPrintBuffer = static_cast(pDevice->VkInstance()->AllocMem( - bufferSize * sizeof(uint32_t), 4, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND)); + remainBufferSize * sizeof(uint32_t), 4, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND)); - memcpy(pPrintBuffer, pPtr, bufferSize * 4); + memcpy(pPrintBuffer, pPtr + decodeOffset, remainBufferSize * 4); } m_printfMemory.Unmap(deviceIdx); - if (bufferSize > 0) + if (remainBufferSize > 1) { const auto& formatStrings = *m_pPipeline->GetFormatStrings(); const uint32_t entryHeaderSize = 2; - uint64_t decodeOffset = 0; PrintfString outputBufferStr(nullptr); outputBufferStr.Reserve(10); Vector outputDecodedSpecifiers(nullptr); @@ -255,46 +323,33 @@ Pal::Result DebugPrintf::PostQueueProcess( OutputBufferString(formatString, *pSubSections, &outputBufferStr); decodeOffset += outputsInDwords; } - WriteToFile(outputBufferStr); + WriteToFile(pFile, outputBufferStr); pDevice->VkInstance()->FreeMem(pPrintBuffer); m_frame++; } } - - return palResult; + return decodeOffset; } // ===================================================================================================================== // Write the outputBuffer to the file void DebugPrintf::WriteToFile( + Util::File* pFile, const PrintfString& outputBuffer) { if (outputBuffer.size() == 0) { return; } - Util::File file; - PrintfString fileName = GetFileName(m_pPipeline->PalPipelineHash(), - ConvertVkPipelineType(m_pPipeline->GetType()), - m_frame, - m_pSettings->debugPrintfDumpFolder); - const char* pOutputName = m_pSettings->debugPrintfToStdout ? "-" : fileName.Data(); - Util::Result result = file.Open(pOutputName, Util::FileAccessMode::FileAccessAppend); + + const char *pFileBegin = "========================= Session Begin ========================\n\0"; + const char *pFileEnd = "========================== Session End =========================\n\0"; + pFile->Write(pFileBegin, strlen(pFileBegin)); + auto result = pFile->Write(outputBuffer.Data(), outputBuffer.size()); if (result == Util::Result::Success) { - const char* fileBeginPrefix = "========================= "; - const char* fileBeginPostfix =" Begin ========================\n"; - const char* fileEnd = "========================= Session End ========================\n"; - file.Write(fileBeginPrefix, strlen(fileBeginPrefix)); - file.Write(fileName.Data(), strlen(fileName.Data())); - file.Write(fileBeginPostfix, strlen(fileBeginPostfix)); - result = file.Write(outputBuffer.Data(), outputBuffer.size()); - if (result == Util::Result::Success) - { - file.Write(fileEnd, strlen(fileEnd)); - file.Flush(); - file.Close(); - } + pFile->Write(pFileEnd, strlen(pFileEnd)); + pFile->Flush(); } } @@ -315,19 +370,11 @@ PrintfString DebugPrintf::GetFileName( sprintf(fileName, "Pipeline%s_0x%016" PRIx64 "_%u", strPipelineTypes[pipelineType], pipelineHash, frameNumber); PrintfString fName(nullptr); - fName.Reserve(10); - - if (m_pSettings->debugPrintfToStdout) - { - AppendPrintfString(&fName, fileName, strlen(fileName)); - } - else - { - AppendPrintfString(&fName, pDumpFolder, strlen(pDumpFolder)); - AppendPrintfString(&fName, "/", 1); - AppendPrintfString(&fName, fileName, strlen(fileName)); - AppendPrintfString(&fName, ".txt\0", 5); - } + fName.Reserve(50); + AppendPrintfString(&fName, pDumpFolder, strlen(pDumpFolder)); + AppendPrintfString(&fName, "/", 1); + AppendPrintfString(&fName, fileName, strlen(fileName)); + AppendPrintfString(&fName, ".txt\0", 5); return fName; } @@ -435,6 +482,7 @@ void DebugPrintf::DecodeSpecifier( // static function called after QueueSubmit void DebugPrintf::PostQueueSubmit( Device* pDevice, + Queue* pQueue, VkCommandBuffer* pCmdBuffers, uint32_t cmdBufferCount) { @@ -446,7 +494,7 @@ void DebugPrintf::PostQueueSubmit( for (uint32_t j = 0; j < cmdBufferCount; ++j) { CmdBuffer* pCmdBuf = ApiCmdBuffer::ObjectFromHandle(pCmdBuffers[j]); - palResult = pCmdBuf->GetDebugPrintf()->PostQueueProcess(pDevice, deviceIdx); + palResult = pCmdBuf->GetDebugPrintf()->PostQueueProcess(pDevice, deviceIdx, pQueue); } } } diff --git a/icd/api/devmode/devmode_mgr.cpp b/icd/api/devmode/devmode_mgr.cpp index 412531bf..ca3937dc 100644 --- a/icd/api/devmode/devmode_mgr.cpp +++ b/icd/api/devmode/devmode_mgr.cpp @@ -2469,8 +2469,10 @@ void DevModeMgr::PipelineCreated( { GpuUtil::RegisterPipelineInfo pipelineInfo = { 0 }; pipelineInfo.apiPsoHash = pPipeline->GetApiHash(); - - m_trace.pGpaSession->RegisterPipeline(pPipeline->PalPipeline(DefaultDeviceIndex), pipelineInfo); + if (pPipeline->PalPipeline(DefaultDeviceIndex) != nullptr) + { + m_trace.pGpaSession->RegisterPipeline(pPipeline->PalPipeline(DefaultDeviceIndex), pipelineInfo); + } } } diff --git a/icd/api/entry.cpp b/icd/api/entry.cpp index 36d365f7..a84a301d 100644 --- a/icd/api/entry.cpp +++ b/icd/api/entry.cpp @@ -106,6 +106,21 @@ VKAPI_ATTR void VKAPI_CALL vkCmdBindIndexBuffer( indexType); } +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdBindIndexBuffer2KHR( + VkCommandBuffer cmdBuffer, + VkBuffer buffer, + VkDeviceSize offset, + VkDeviceSize size, + VkIndexType indexType) +{ + ApiCmdBuffer::ObjectFromHandle(cmdBuffer)->BindIndexBuffer( + buffer, + offset, + size, + indexType); +} + // ===================================================================================================================== VKAPI_ATTR void VKAPI_CALL vkCmdBindVertexBuffers( VkCommandBuffer cmdBuffer, diff --git a/icd/api/graphics_pipeline_common.cpp b/icd/api/graphics_pipeline_common.cpp index 39d4369b..93b6a8f1 100644 --- a/icd/api/graphics_pipeline_common.cpp +++ b/icd/api/graphics_pipeline_common.cpp @@ -687,7 +687,7 @@ uint64_t GraphicsPipelineCommon::GetDynamicStateFlags( // ===================================================================================================================== void GraphicsPipelineCommon::ExtractLibraryInfo( const VkGraphicsPipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, GraphicsPipelineLibraryInfo* pLibInfo) { @@ -798,13 +798,13 @@ VkResult GraphicsPipelineCommon::Create( Device* pDevice, PipelineCache* pPipelineCache, const VkGraphicsPipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const VkAllocationCallbacks* pAllocator, VkPipeline* pPipeline) { VkResult result; - const bool isLibrary = flags & VK_PIPELINE_CREATE_LIBRARY_BIT_KHR; + const bool isLibrary = Util::TestAnyFlagSet(flags, VK_PIPELINE_CREATE_LIBRARY_BIT_KHR); if (isLibrary) { @@ -1288,12 +1288,14 @@ static void BuildVrsRateParams( const uint64_t dynamicStateFlags, GraphicsPipelineObjectCreateInfo* pInfo) { + pInfo->flags.fragmentShadingRateEnable = 1; + + Device::SetDefaultVrsRateParams(&pInfo->immedInfo.vrsRateParams); + if (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::FragmentShadingRateStateKhr) == false) { if (pFsr != nullptr) { - pInfo->immedInfo.vrsRateParams.flags.exposeVrsPixelsMask = 1; - pInfo->immedInfo.vrsRateParams.shadingRate = VkToPalShadingSize(VkClampShadingRate(pFsr->fragmentSize, pDevice->GetMaxVrsShadingRate())); @@ -1301,24 +1303,18 @@ static void BuildVrsRateParams( static_cast(Pal::VrsCombinerStage::ProvokingVertex)] = VkToPalShadingRateCombinerOp(pFsr->combinerOps[0]); - pInfo->immedInfo.vrsRateParams.combinerState[static_cast( - Pal::VrsCombinerStage::Primitive)] = VkToPalShadingRateCombinerOp(pFsr->combinerOps[0]); - pInfo->immedInfo.vrsRateParams.combinerState[ - static_cast(Pal::VrsCombinerStage::Image)] = VkToPalShadingRateCombinerOp(pFsr->combinerOps[1]); + static_cast(Pal::VrsCombinerStage::Primitive)] = + VkToPalShadingRateCombinerOp(pFsr->combinerOps[0]); pInfo->immedInfo.vrsRateParams.combinerState[ - static_cast(Pal::VrsCombinerStage::PsIterSamples)] = Pal::VrsCombiner::Passthrough; - pInfo->flags.fragmentShadingRateEnable = 1; - + static_cast(Pal::VrsCombinerStage::Image)] = + VkToPalShadingRateCombinerOp(pFsr->combinerOps[1]); } + pInfo->staticStateMask |= 1ULL << static_cast(DynamicStatesInternal::FragmentShadingRateStateKhr); } - else - { - pInfo->flags.fragmentShadingRateEnable = 1; - } } // ===================================================================================================================== @@ -2056,12 +2052,7 @@ static void BuildExecutablePipelineState( if (pInfo->flags.force1x1ShaderRate == true) { - pInfo->immedInfo.vrsRateParams.shadingRate = Pal::VrsShadingRate::_1x1; - - for (uint32 idx = 0; idx <= static_cast(Pal::VrsCombinerStage::Image); idx++) - { - pInfo->immedInfo.vrsRateParams.combinerState[idx] = Pal::VrsCombiner::Passthrough; - } + Device::SetDefaultVrsRateParams(&pInfo->immedInfo.vrsRateParams); } if ((pInfo->immedInfo.rasterizerDiscardEnable == true) || @@ -2086,7 +2077,6 @@ static void BuildExecutablePipelineState( pInfo->flags.sampleShadingEnable = false; } -#if PAL_BUILD_GFX103 // Both MSAA and VRS would utilize the value of PS_ITER_SAMPLES // Thus, choose the min combiner (i.e. choose the higher quality rate) when both features are enabled if ((pInfo->immedInfo.msaaCreateInfo.pixelShaderSamples > 1) && @@ -2095,7 +2085,6 @@ static void BuildExecutablePipelineState( pInfo->immedInfo.vrsRateParams.combinerState[ static_cast(Pal::VrsCombinerStage::PsIterSamples)] = Pal::VrsCombiner::Min; } -#endif pInfo->flags.bindDepthStencilObject = !(IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::StencilOp) || @@ -2137,7 +2126,7 @@ static void BuildExecutablePipelineState( void GraphicsPipelineCommon::BuildPipelineObjectCreateInfo( const Device* pDevice, const VkGraphicsPipelineCreateInfo* pIn, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const PipelineOptimizerKey* pOptimizerKey, const PipelineMetadata* pBinMeta, GraphicsPipelineObjectCreateInfo* pInfo) @@ -2156,7 +2145,13 @@ void GraphicsPipelineCommon::BuildPipelineObjectCreateInfo( pInfo->activeStages = GetActiveShaderStages(pIn, &libInfo); - if (Util::TestAnyFlagSet(pInfo->activeStages, VK_SHADER_STAGE_MESH_BIT_EXT)) + VkShaderStageFlagBits preRasterStages = {}; + if (libInfo.pPreRasterizationShaderLib != nullptr) + { + preRasterStages = libInfo.pPreRasterizationShaderLib->GetPipelineObjectCreateInfo().activeStages; + } + + if (Util::TestAnyFlagSet(pInfo->activeStages | preRasterStages, VK_SHADER_STAGE_MESH_BIT_EXT)) { hasMesh = true; } @@ -2244,7 +2239,7 @@ void GraphicsPipelineCommon::BuildPipelineObjectCreateInfo( void GraphicsPipelineCommon::GeneratePipelineOptimizerKey( const Device* pDevice, const VkGraphicsPipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const GraphicsPipelineShaderStageInfo* pShaderStageInfo, ShaderOptimizerKey* pShaderKeys, PipelineOptimizerKey* pPipelineKey) @@ -2269,8 +2264,6 @@ void GraphicsPipelineCommon::GeneratePipelineOptimizerKey( const auto* pModuleData = reinterpret_cast( ShaderModule::GetFirstValidShaderData(stage.pModuleHandle)); - VK_ASSERT(pModuleData != nullptr); - pDevice->GetShaderOptimizer()->CreateShaderOptimizerKey( pModuleData, stage.codeHash, @@ -2296,8 +2289,6 @@ void GraphicsPipelineCommon::GeneratePipelineOptimizerKey( const auto* pModuleData = reinterpret_cast( ShaderModule::GetFirstValidShaderData(stage.pModuleHandle)); - VK_ASSERT(pModuleData != nullptr); - pDevice->GetShaderOptimizer()->CreateShaderOptimizerKey( pModuleData, stage.codeHash, @@ -2925,7 +2916,7 @@ bool GraphicsPipelineCommon::IsRasterizationDisabled( // - pCreateInfo->subpass void GraphicsPipelineCommon::BuildApiHash( const VkGraphicsPipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, uint64_t* pApiHash, Util::MetroHash::Hash* pElfHash) { diff --git a/icd/api/include/barrier_policy.h b/icd/api/include/barrier_policy.h index 61830d8d..ebb25b7b 100644 --- a/icd/api/include/barrier_policy.h +++ b/icd/api/include/barrier_policy.h @@ -350,7 +350,7 @@ class BufferBarrierPolicy final : public ResourceBarrierPolicy public: BufferBarrierPolicy( Device* pDevice, - BufferUsageFlagBits usage, + VkBufferUsageFlagBits2KHR usage, VkSharingMode sharingMode, uint32_t queueFamilyIndexCount, const uint32_t* pQueueFamilyIndices); @@ -364,7 +364,7 @@ class BufferBarrierPolicy final : public ResourceBarrierPolicy protected: void InitBufferCachePolicy( Device* pDevice, - BufferUsageFlagBits usage, + VkBufferUsageFlagBits2KHR usage, VkSharingMode sharingMode, uint32_t queueFamilyIndexCount, const uint32_t* pQueueFamilyIndices); diff --git a/icd/api/include/compiler_solution.h b/icd/api/include/compiler_solution.h index e8ade011..de177a4b 100644 --- a/icd/api/include/compiler_solution.h +++ b/icd/api/include/compiler_solution.h @@ -131,6 +131,7 @@ struct PipelineMetadata bool rayQueryUsed; #endif bool pointSizeUsed; + bool dualSrcBlendingUsed; bool shadingRateUsedInShader; bool enableEarlyCompile; bool enableUberFetchShader; @@ -167,7 +168,7 @@ struct GraphicsPipelineBinaryCreateInfo void* pTempBuffer; void* pMappingBuffer; size_t mappingBufferSize; - PipelineCreateFlags flags; + VkPipelineCreateFlags2KHR flags; VkFormat dbFormat; PipelineOptimizerKey* pPipelineProfileKey; PipelineCompilerType compilerType; @@ -193,7 +194,7 @@ struct ComputePipelineBinaryCreateInfo void* pTempBuffer; void* pMappingBuffer; size_t mappingBufferSize; - PipelineCreateFlags flags; + VkPipelineCreateFlags2KHR flags; const PipelineOptimizerKey* pPipelineProfileKey; PipelineCompilerType compilerType; FreeCompilerBinary freeCompilerBinary; @@ -211,7 +212,7 @@ struct RayTracingPipelineBinaryCreateInfo void* pTempBuffer; void* pMappingBuffer; size_t mappingBufferSize; - PipelineCreateFlags flags; + VkPipelineCreateFlags2KHR flags; const PipelineOptimizerKey* pPipelineProfileKey; PipelineCompilerType compilerType; FreeCompilerBinary freeCompilerBinary; diff --git a/icd/api/include/debug_printf.h b/icd/api/include/debug_printf.h index 8d5fe22b..8b537d51 100644 --- a/icd/api/include/debug_printf.h +++ b/icd/api/include/debug_printf.h @@ -113,8 +113,8 @@ class DebugPrintf uint32_t userDataOffset); void PreQueueSubmit(Device* pDevice, uint32_t deviceIdx); - Pal::Result PostQueueProcess(Device* pDevice, uint32_t deviceIdx); - static void PostQueueSubmit(Device* pDevice, VkCommandBuffer* pCmdBuffers, uint32_t cmdBufferCount); + Pal::Result PostQueueProcess(Device* pDevice, uint32_t deviceIdx, Queue* pQueue); + static void PostQueueSubmit(Device* pDevice, Queue* pQueue, VkCommandBuffer* pCmdBuffers, uint32_t cmdBufferCount); static void DecodeFormatStringsFromElf( const Device* pDevice, @@ -142,7 +142,13 @@ class DebugPrintf const PrintfSubSection& subSections, PrintfString* pOutputStr); - void WriteToFile(const PrintfString& outputBuffer); + void WriteToFile(Util::File* pFile, const PrintfString& outputBuffer); + + uint64_t ProcessDebugPrintfBuffer( + Device* pDevice, + uint32_t deviceIdx, + uint64_t decodeOffset, + Util::File* pFile); PrintfString GetFileName( uint64_t pipelineHash, @@ -158,6 +164,7 @@ class DebugPrintf uint32_t m_frame; Util::Mutex m_mutex; PalAllocator* m_pAllocator; + VkSemaphore m_semaphore; }; } #endif diff --git a/icd/api/include/graphics_pipeline_common.h b/icd/api/include/graphics_pipeline_common.h index 928b57d9..61b489c0 100644 --- a/icd/api/include/graphics_pipeline_common.h +++ b/icd/api/include/graphics_pipeline_common.h @@ -197,7 +197,7 @@ class GraphicsPipelineCommon : public Pipeline Device* pDevice, PipelineCache* pPipelineCache, const VkGraphicsPipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const VkAllocationCallbacks* pAllocator, VkPipeline* pPipeline); @@ -232,7 +232,7 @@ class GraphicsPipelineCommon : public Pipeline // Extract graphics pipeline library related info from VkGraphicsPipelineCreateInfo. static void ExtractLibraryInfo( const VkGraphicsPipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, GraphicsPipelineLibraryInfo* pLibInfo); // Check whether pipeline binary will be built @@ -251,7 +251,7 @@ class GraphicsPipelineCommon : public Pipeline static void BuildPipelineObjectCreateInfo( const Device* pDevice, const VkGraphicsPipelineCreateInfo* pIn, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const PipelineOptimizerKey* pOptimizerKey, const PipelineMetadata* pBinMeta, GraphicsPipelineObjectCreateInfo* pObjInfo); @@ -260,7 +260,7 @@ class GraphicsPipelineCommon : public Pipeline static void GeneratePipelineOptimizerKey( const Device* pDevice, const VkGraphicsPipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const GraphicsPipelineShaderStageInfo* pShaderStageInfo, ShaderOptimizerKey* pShaderKeys, PipelineOptimizerKey* pPipelineKey); @@ -268,7 +268,7 @@ class GraphicsPipelineCommon : public Pipeline // Generates the API PSO hash using the contents of the VkGraphicsPipelineCreateInfo struct static void BuildApiHash( const VkGraphicsPipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, uint64_t* pApiHash, Util::MetroHash::Hash* elfHash); diff --git a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h index 2bb2c895..904ac6fc 100644 --- a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h +++ b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h @@ -69,7 +69,7 @@ extern "C" { #define VK_API_VERSION_1_0 VK_MAKE_API_VERSION(0, 1, 0, 0)// Patch version should always be set to 0 // Version of this file -#define VK_HEADER_VERSION 267 +#define VK_HEADER_VERSION 269 // Complete version of this file #define VK_HEADER_VERSION_COMPLETE VK_MAKE_API_VERSION(0, 1, 3, VK_HEADER_VERSION) @@ -902,6 +902,11 @@ typedef enum VkStructureType { #endif VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DIAGNOSTICS_CONFIG_FEATURES_NV = 1000300000, VK_STRUCTURE_TYPE_DEVICE_DIAGNOSTICS_CONFIG_CREATE_INFO_NV = 1000300001, + VK_STRUCTURE_TYPE_CUDA_MODULE_CREATE_INFO_NV = 1000307000, + VK_STRUCTURE_TYPE_CUDA_FUNCTION_CREATE_INFO_NV = 1000307001, + VK_STRUCTURE_TYPE_CUDA_LAUNCH_INFO_NV = 1000307002, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUDA_KERNEL_LAUNCH_FEATURES_NV = 1000307003, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUDA_KERNEL_LAUNCH_PROPERTIES_NV = 1000307004, VK_STRUCTURE_TYPE_QUERY_LOW_LATENCY_SUPPORT_NV = 1000310000, VK_STRUCTURE_TYPE_EXPORT_METAL_OBJECT_CREATE_INFO_EXT = 1000311000, VK_STRUCTURE_TYPE_EXPORT_METAL_OBJECTS_INFO_EXT = 1000311001, @@ -1035,6 +1040,9 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_SAMPLER_BORDER_COLOR_COMPONENT_MAPPING_CREATE_INFO_EXT = 1000411001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PAGEABLE_DEVICE_LOCAL_MEMORY_FEATURES_EXT = 1000412000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CORE_PROPERTIES_ARM = 1000415000, + VK_STRUCTURE_TYPE_DEVICE_QUEUE_SHADER_CORE_CONTROL_CREATE_INFO_ARM = 1000417000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCHEDULING_CONTROLS_FEATURES_ARM = 1000417001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCHEDULING_CONTROLS_PROPERTIES_ARM = 1000417002, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_SLICED_VIEW_OF_3D_FEATURES_EXT = 1000418000, VK_STRUCTURE_TYPE_IMAGE_VIEW_SLICED_CREATE_INFO_EXT = 1000418001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_SET_HOST_MAPPING_FEATURES_VALVE = 1000420000, @@ -1420,6 +1428,8 @@ typedef enum VkObjectType { VK_OBJECT_TYPE_PERFORMANCE_CONFIGURATION_INTEL = 1000210000, VK_OBJECT_TYPE_DEFERRED_OPERATION_KHR = 1000268000, VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NV = 1000277000, + VK_OBJECT_TYPE_CUDA_MODULE_NV = 1000307000, + VK_OBJECT_TYPE_CUDA_FUNCTION_NV = 1000307001, VK_OBJECT_TYPE_BUFFER_COLLECTION_FUCHSIA = 1000366000, VK_OBJECT_TYPE_MICROMAP_EXT = 1000396000, VK_OBJECT_TYPE_OPTICAL_FLOW_SESSION_NV = 1000464000, @@ -10768,6 +10778,8 @@ typedef enum VkDebugReportObjectTypeEXT { VK_DEBUG_REPORT_OBJECT_TYPE_CU_FUNCTION_NVX_EXT = 1000029001, VK_DEBUG_REPORT_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR_EXT = 1000150000, VK_DEBUG_REPORT_OBJECT_TYPE_ACCELERATION_STRUCTURE_NV_EXT = 1000165000, + VK_DEBUG_REPORT_OBJECT_TYPE_CUDA_MODULE_NV = 1000307000, + VK_DEBUG_REPORT_OBJECT_TYPE_CUDA_FUNCTION_NV = 1000307001, VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_COLLECTION_FUCHSIA_EXT = 1000366000, VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_EXT = VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_CALLBACK_EXT_EXT, VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT = VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT_EXT, @@ -14917,6 +14929,98 @@ typedef struct VkDeviceDiagnosticsConfigCreateInfoNV { #define VK_QCOM_RENDER_PASS_STORE_OPS_EXTENSION_NAME "VK_QCOM_render_pass_store_ops" +// VK_NV_cuda_kernel_launch is a preprocessor guard. Do not pass it to API calls. +#define VK_NV_cuda_kernel_launch 1 +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkCudaModuleNV) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkCudaFunctionNV) +#define VK_NV_CUDA_KERNEL_LAUNCH_SPEC_VERSION 2 +#define VK_NV_CUDA_KERNEL_LAUNCH_EXTENSION_NAME "VK_NV_cuda_kernel_launch" +typedef struct VkCudaModuleCreateInfoNV { + VkStructureType sType; + const void* pNext; + size_t dataSize; + const void* pData; +} VkCudaModuleCreateInfoNV; + +typedef struct VkCudaFunctionCreateInfoNV { + VkStructureType sType; + const void* pNext; + VkCudaModuleNV module; + const char* pName; +} VkCudaFunctionCreateInfoNV; + +typedef struct VkCudaLaunchInfoNV { + VkStructureType sType; + const void* pNext; + VkCudaFunctionNV function; + uint32_t gridDimX; + uint32_t gridDimY; + uint32_t gridDimZ; + uint32_t blockDimX; + uint32_t blockDimY; + uint32_t blockDimZ; + uint32_t sharedMemBytes; + size_t paramCount; + const void* const * pParams; + size_t extraCount; + const void* const * pExtras; +} VkCudaLaunchInfoNV; + +typedef struct VkPhysicalDeviceCudaKernelLaunchFeaturesNV { + VkStructureType sType; + void* pNext; + VkBool32 cudaKernelLaunchFeatures; +} VkPhysicalDeviceCudaKernelLaunchFeaturesNV; + +typedef struct VkPhysicalDeviceCudaKernelLaunchPropertiesNV { + VkStructureType sType; + void* pNext; + uint32_t computeCapabilityMinor; + uint32_t computeCapabilityMajor; +} VkPhysicalDeviceCudaKernelLaunchPropertiesNV; + +typedef VkResult (VKAPI_PTR *PFN_vkCreateCudaModuleNV)(VkDevice device, const VkCudaModuleCreateInfoNV* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkCudaModuleNV* pModule); +typedef VkResult (VKAPI_PTR *PFN_vkGetCudaModuleCacheNV)(VkDevice device, VkCudaModuleNV module, size_t* pCacheSize, void* pCacheData); +typedef VkResult (VKAPI_PTR *PFN_vkCreateCudaFunctionNV)(VkDevice device, const VkCudaFunctionCreateInfoNV* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkCudaFunctionNV* pFunction); +typedef void (VKAPI_PTR *PFN_vkDestroyCudaModuleNV)(VkDevice device, VkCudaModuleNV module, const VkAllocationCallbacks* pAllocator); +typedef void (VKAPI_PTR *PFN_vkDestroyCudaFunctionNV)(VkDevice device, VkCudaFunctionNV function, const VkAllocationCallbacks* pAllocator); +typedef void (VKAPI_PTR *PFN_vkCmdCudaLaunchKernelNV)(VkCommandBuffer commandBuffer, const VkCudaLaunchInfoNV* pLaunchInfo); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkCreateCudaModuleNV( + VkDevice device, + const VkCudaModuleCreateInfoNV* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkCudaModuleNV* pModule); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetCudaModuleCacheNV( + VkDevice device, + VkCudaModuleNV module, + size_t* pCacheSize, + void* pCacheData); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateCudaFunctionNV( + VkDevice device, + const VkCudaFunctionCreateInfoNV* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkCudaFunctionNV* pFunction); + +VKAPI_ATTR void VKAPI_CALL vkDestroyCudaModuleNV( + VkDevice device, + VkCudaModuleNV module, + const VkAllocationCallbacks* pAllocator); + +VKAPI_ATTR void VKAPI_CALL vkDestroyCudaFunctionNV( + VkDevice device, + VkCudaFunctionNV function, + const VkAllocationCallbacks* pAllocator); + +VKAPI_ATTR void VKAPI_CALL vkCmdCudaLaunchKernelNV( + VkCommandBuffer commandBuffer, + const VkCudaLaunchInfoNV* pLaunchInfo); +#endif + + // VK_NV_low_latency is a preprocessor guard. Do not pass it to API calls. #define VK_NV_low_latency 1 #define VK_NV_LOW_LATENCY_SPEC_VERSION 1 @@ -16475,6 +16579,36 @@ typedef struct VkPhysicalDeviceShaderCorePropertiesARM { +// VK_ARM_scheduling_controls is a preprocessor guard. Do not pass it to API calls. +#define VK_ARM_scheduling_controls 1 +#define VK_ARM_SCHEDULING_CONTROLS_SPEC_VERSION 1 +#define VK_ARM_SCHEDULING_CONTROLS_EXTENSION_NAME "VK_ARM_scheduling_controls" +typedef VkFlags64 VkPhysicalDeviceSchedulingControlsFlagsARM; + +typedef enum VkPhysicalDeviceSchedulingControlsFlagBitsARM { + VK_PHYSICAL_DEVICE_SCHEDULING_CONTROLS_SHADER_CORE_COUNT_ARM = 0x00000001, + VK_PHYSICAL_DEVICE_SCHEDULING_CONTROLS_FLAG_BITS_MAX_ENUM_ARM = 0x7FFFFFFF +} VkPhysicalDeviceSchedulingControlsFlagBitsARM; +typedef struct VkDeviceQueueShaderCoreControlCreateInfoARM { + VkStructureType sType; + void* pNext; + uint32_t shaderCoreCount; +} VkDeviceQueueShaderCoreControlCreateInfoARM; + +typedef struct VkPhysicalDeviceSchedulingControlsFeaturesARM { + VkStructureType sType; + void* pNext; + VkBool32 schedulingControls; +} VkPhysicalDeviceSchedulingControlsFeaturesARM; + +typedef struct VkPhysicalDeviceSchedulingControlsPropertiesARM { + VkStructureType sType; + void* pNext; + VkPhysicalDeviceSchedulingControlsFlagsARM schedulingControlsFlags; +} VkPhysicalDeviceSchedulingControlsPropertiesARM; + + + // VK_EXT_image_sliced_view_of_3d is a preprocessor guard. Do not pass it to API calls. #define VK_EXT_image_sliced_view_of_3d 1 #define VK_EXT_IMAGE_SLICED_VIEW_OF_3D_SPEC_VERSION 1 diff --git a/icd/api/include/pipeline_compiler.h b/icd/api/include/pipeline_compiler.h index f8e985e8..1df8f41c 100644 --- a/icd/api/include/pipeline_compiler.h +++ b/icd/api/include/pipeline_compiler.h @@ -134,9 +134,9 @@ class PipelineCompiler PipelineBinaryCache* GetBinaryCache() const { return m_pBinaryCache; } void ApplyPipelineOptions( - const Device* pDevice, - VkPipelineCreateFlags flags, - Vkgc::PipelineOptions* pOptions + const Device* pDevice, + VkPipelineCreateFlags2KHR flags, + Vkgc::PipelineOptions* pOptions ); VkResult BuildShaderModule( @@ -165,7 +165,7 @@ class PipelineCompiler uint32_t deviceIndex, PipelineCache* pPipelineCache, GraphicsPipelineBinaryCreateInfo* pCreateInfo, - const PipelineCreateFlags flags, + const VkPipelineCreateFlags2KHR flags, Vkgc::BinaryData* pPipelineBinary, Util::MetroHash::Hash* pCacheId); @@ -217,7 +217,7 @@ class PipelineCompiler VkResult ConvertGraphicsPipelineInfo( const Device* pDevice, const VkGraphicsPipelineCreateInfo* pIn, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const GraphicsPipelineShaderStageInfo* pShaderInfo, const PipelineLayout* pPipelineLayout, PipelineOptimizerKey* pPipelineProfileKey, @@ -227,7 +227,7 @@ class PipelineCompiler VkResult BuildGplFastLinkCreateInfo( const Device* pDevice, const VkGraphicsPipelineCreateInfo* pIn, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const GraphicsPipelineLibraryInfo& libInfo, const PipelineLayout* pPipelineLayout, PipelineMetadata* pBinaryMetadata, @@ -240,7 +240,7 @@ class PipelineCompiler const PipelineOptimizerKey* pPipelineProfileKey, PipelineMetadata* pBinaryMetadata, ComputePipelineBinaryCreateInfo* pInfo, - PipelineCreateFlags flags); + VkPipelineCreateFlags2KHR flags); void FreeComputePipelineBinary( ComputePipelineBinaryCreateInfo* pCreateInfo, @@ -260,7 +260,7 @@ class PipelineCompiler VkResult ConvertRayTracingPipelineInfo( const Device* pDevice, const VkRayTracingPipelineCreateInfoKHR* pIn, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const RayTracingPipelineShaderStageInfo* pShaderInfo, const PipelineOptimizerKey* pPipelineProfileKey, RayTracingPipelineBinaryCreateInfo* pCreateInfo); @@ -479,8 +479,8 @@ class PipelineCompiler #endif VkResult LoadShaderModuleFromCache( - const Device* pDevice, const VkShaderModuleCreateFlags flags, + const VkShaderModuleCreateFlags internalShaderFlags, const uint32_t compilerMask, const Util::MetroHash::Hash& uniqueHash, PipelineBinaryCache* pBinaryCache, @@ -488,8 +488,8 @@ class PipelineCompiler ShaderModuleHandle* pShaderModule); void StoreShaderModuleToCache( - const Device* pDevice, const VkShaderModuleCreateFlags flags, + const VkShaderModuleCreateFlags internalShaderFlags, const uint32_t compilerMask, const Util::MetroHash::Hash& uniqueHash, PipelineBinaryCache* pBinaryCache, diff --git a/icd/api/include/vk_cmdbuffer.h b/icd/api/include/vk_cmdbuffer.h index f7972b8e..bfef652f 100644 --- a/icd/api/include/vk_cmdbuffer.h +++ b/icd/api/include/vk_cmdbuffer.h @@ -1217,10 +1217,6 @@ class CmdBuffer const Pal::ImageResolveRegion* pRegions, uint32_t deviceMask); - bool PreBltBindMsaaState(const Image& image); - - void PostBltRestoreMsaaState(bool bltMsaaState); - void PalCmdBindMsaaStates(const Pal::IMsaaState* const * pStates); inline void PalCmdBindMsaaState( @@ -2040,6 +2036,13 @@ VKAPI_ATTR void VKAPI_CALL vkCmdBindIndexBuffer( VkDeviceSize offset, VkIndexType indexType); +VKAPI_ATTR void VKAPI_CALL vkCmdBindIndexBuffer2KHR( + VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset, + VkDeviceSize size, + VkIndexType indexType); + VKAPI_ATTR void VKAPI_CALL vkCmdBindVertexBuffers( VkCommandBuffer commandBuffer, uint32_t firstBinding, diff --git a/icd/api/include/vk_compute_pipeline.h b/icd/api/include/vk_compute_pipeline.h index 626581be..16f18100 100644 --- a/icd/api/include/vk_compute_pipeline.h +++ b/icd/api/include/vk_compute_pipeline.h @@ -61,7 +61,7 @@ class ComputePipeline final : public Pipeline, public NonDispatchable - static PipelineCreateFlags GetPipelineCreateFlags( + static VkPipelineCreateFlags2KHR GetPipelineCreateFlags( const CreateInfo* pCreateInfo); - static BufferUsageFlagBits GetBufferUsageFlagBits( + static VkBufferUsageFlagBits2KHR GetBufferUsageFlagBits( const VkBufferCreateInfo* pCreateInfo); + static void SetDefaultVrsRateParams( + Pal::VrsRateParams* pVrsRateParams); + protected: Device( uint32_t deviceCount, @@ -836,7 +839,6 @@ class Device void DestroyInternalPipeline(InternalPipeline* pPipeline); - VkResult CreateBltMsaaStates(); void DestroyInternalPipelines(); #if VKI_RAY_TRACING VkResult CreateRayTraceState(); @@ -878,10 +880,6 @@ class Device InternalPipeline m_accelerationStructureQueryCopyPipeline; #endif - static const uint32_t BltMsaaStateCount = 4; - - Pal::IMsaaState* m_pBltMsaaState[BltMsaaStateCount][MaxPalDevices]; - const DeviceBarrierPolicy m_barrierPolicy; // Barrier policy to use for this device const DeviceExtensions::Enabled m_enabledExtensions; // Enabled device extensions @@ -975,22 +973,6 @@ class Device }; // ===================================================================================================================== -const Pal::IMsaaState* const * Device::GetBltMsaaState( - uint32_t imgSampleCount - ) const -{ - uint32_t i = Util::Log2(imgSampleCount); - - if (i < BltMsaaStateCount) - { - return &m_pBltMsaaState[i][0]; - } - else - { - return nullptr; - } -} - VK_DEFINE_DISPATCHABLE(Device); namespace entry @@ -1178,6 +1160,11 @@ VKAPI_ATTR void VKAPI_CALL vkGetRenderAreaGranularity( VkRenderPass renderPass, VkExtent2D* pGranularity); +VKAPI_ATTR void VKAPI_CALL vkGetRenderingAreaGranularityKHR( + VkDevice device, + const VkRenderingAreaInfoKHR* pRenderingAreaInfo, + VkExtent2D* pGranularity); + VKAPI_ATTR VkResult VKAPI_CALL vkBindBufferMemory2( VkDevice device, uint32_t bindInfoCount, @@ -1414,6 +1401,17 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetDeviceFaultInfoEXT( VkDeviceFaultCountsEXT* pFaultCounts, VkDeviceFaultInfoEXT* pFaultInfo); +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkGetDeviceImageSubresourceLayoutKHR( + VkDevice device, + const VkDeviceImageSubresourceInfoKHR* pInfo, + VkSubresourceLayout2KHR* pLayout); +VKAPI_ATTR void VKAPI_CALL vkGetImageSubresourceLayout2KHR( + VkDevice device, + VkImage image, + const VkImageSubresource2KHR* pSubresource, + VkSubresourceLayout2KHR* pLayout); + } // namespace entry } // namespace vk diff --git a/icd/api/include/vk_extensions.h b/icd/api/include/vk_extensions.h index 7262e2f1..873e1190 100644 --- a/icd/api/include/vk_extensions.h +++ b/icd/api/include/vk_extensions.h @@ -273,6 +273,7 @@ class DeviceExtensions final : public Extensions #endif KHR_BIND_MEMORY2, KHR_BUFFER_DEVICE_ADDRESS, + KHR_COOPERATIVE_MATRIX, KHR_COPY_COMMANDS2, KHR_CREATE_RENDERPASS2, KHR_DEDICATED_ALLOCATION, @@ -306,6 +307,7 @@ class DeviceExtensions final : public Extensions KHR_MAINTENANCE2, KHR_MAINTENANCE3, KHR_MAINTENANCE4, + KHR_MAINTENANCE5, KHR_MAP_MEMORY2, KHR_MULTIVIEW, KHR_PIPELINE_EXECUTABLE_PROPERTIES, @@ -369,6 +371,7 @@ class DeviceExtensions final : public Extensions EXT_EXTENDED_DYNAMIC_STATE3, EXT_EXTERNAL_MEMORY_DMA_BUF, EXT_EXTERNAL_MEMORY_HOST, + EXT_FRAGMENT_SHADER_INTERLOCK, EXT_FRAME_BOUNDARY, EXT_GLOBAL_PRIORITY, EXT_GLOBAL_PRIORITY_QUERY, diff --git a/icd/api/include/vk_formats.h b/icd/api/include/vk_formats.h index 7f85d42a..5fa675b9 100755 --- a/icd/api/include/vk_formats.h +++ b/icd/api/include/vk_formats.h @@ -92,8 +92,13 @@ struct Formats #define VK_YUV_FORMAT_END VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM #define VK_YUV_IMAGE_FORMAT_COUNT (VK_YUV_FORMAT_END - VK_YUV_FORMAT_START + 1) +#define VK_MAINTENANCE5_FORMAT_START VK_FORMAT_A1B5G5R5_UNORM_PACK16 +#define VK_MAINTENANCE5_FORMAT_END VK_FORMAT_A8_UNORM_KHR +#define VK_MAINTENANCE5_IMAGE_FORMAT_COUNT (VK_MAINTENANCE5_FORMAT_END - VK_MAINTENANCE5_FORMAT_START + 1) + // Number of formats supported by the driver. -#define VK_SUPPORTED_FORMAT_COUNT (VK_FORMAT_RANGE_SIZE + VK_YUV_IMAGE_FORMAT_COUNT + VK_EXT_4444_FORMAT_COUNT) +#define VK_SUPPORTED_FORMAT_COUNT (VK_FORMAT_RANGE_SIZE + VK_YUV_IMAGE_FORMAT_COUNT + VK_EXT_4444_FORMAT_COUNT + \ + VK_MAINTENANCE5_IMAGE_FORMAT_COUNT) // ===================================================================================================================== // Get a linear index for a format (used to address tables indirectly indexed by formats). @@ -112,6 +117,11 @@ uint32_t Formats::GetIndex(VkFormat format) { return VK_FORMAT_RANGE_SIZE + VK_YUV_IMAGE_FORMAT_COUNT + (format - VK_EXT_4444_FORMAT_START); } + else if ((format >= VK_MAINTENANCE5_FORMAT_START) && (format <= VK_MAINTENANCE5_FORMAT_END)) + { + return VK_FORMAT_RANGE_SIZE + VK_YUV_IMAGE_FORMAT_COUNT + VK_EXT_4444_FORMAT_COUNT + + (format - VK_MAINTENANCE5_FORMAT_START); + } else { VK_ALERT(!"Unexpected format"); @@ -139,6 +149,14 @@ VkFormat Formats::FromIndex(uint32_t index) return static_cast(VK_EXT_4444_FORMAT_START + index - VK_FORMAT_RANGE_SIZE - VK_YUV_IMAGE_FORMAT_COUNT); } + else if ((index >= (VK_FORMAT_RANGE_SIZE + VK_YUV_IMAGE_FORMAT_COUNT + VK_EXT_4444_FORMAT_COUNT)) && + (index < (VK_FORMAT_RANGE_SIZE + VK_YUV_IMAGE_FORMAT_COUNT + VK_EXT_4444_FORMAT_COUNT + + VK_MAINTENANCE5_IMAGE_FORMAT_COUNT))) + { + return static_cast(VK_MAINTENANCE5_FORMAT_START + index - VK_FORMAT_RANGE_SIZE + - VK_YUV_IMAGE_FORMAT_COUNT + - VK_EXT_4444_FORMAT_COUNT); + } else { VK_ASSERT(!"Unexpected format index"); @@ -203,7 +221,8 @@ bool Formats::IsColorFormat(VkFormat format) return ((format >= VK_FORMAT_R4G4_UNORM_PACK8) && (format <= VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)) || ((format >= VK_FORMAT_BC1_RGB_UNORM_BLOCK) && (format <= VK_FORMAT_ASTC_12x12_SRGB_BLOCK)) || - (format == VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT) || (format == VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT); + (format == VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT) || (format == VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT) || + ((format == VK_FORMAT_A1B5G5R5_UNORM_PACK16) || (format == VK_FORMAT_A8_UNORM_KHR)); } // ===================================================================================================================== diff --git a/icd/api/include/vk_graphics_pipeline.h b/icd/api/include/vk_graphics_pipeline.h index 9743ce82..1a98dd76 100644 --- a/icd/api/include/vk_graphics_pipeline.h +++ b/icd/api/include/vk_graphics_pipeline.h @@ -143,19 +143,6 @@ static void ConvertToPalMsaaQuadSamplePattern( } } -// ===================================================================================================================== -// Force 1x1 shader rate -static void Force1x1ShaderRate( - Pal::VrsRateParams* pVrsRateParams) -{ - pVrsRateParams->shadingRate = Pal::VrsShadingRate::_1x1; - - for (uint32 idx = 0; idx <= static_cast(Pal::VrsCombinerStage::Image); idx++) - { - pVrsRateParams->combinerState[idx] = Pal::VrsCombiner::Passthrough; - } -} - // ===================================================================================================================== // Vulkan implementation of graphics pipelines created by vkCreateGraphicsPipeline class GraphicsPipeline final : public GraphicsPipelineCommon, public NonDispatchable @@ -165,7 +152,7 @@ class GraphicsPipeline final : public GraphicsPipelineCommon, public NonDispatch Device* pDevice, PipelineCache* pPipelineCache, const VkGraphicsPipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const VkAllocationCallbacks* pAllocator, VkPipeline* pPipeline); @@ -234,7 +221,7 @@ class GraphicsPipeline final : public GraphicsPipelineCommon, public NonDispatch static VkResult CreatePipelineBinaries( Device* pDevice, const VkGraphicsPipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const GraphicsPipelineShaderStageInfo* pShaderInfo, const PipelineLayout* pPipelineLayout, const Util::MetroHash::Hash* pElfHash, @@ -249,7 +236,7 @@ class GraphicsPipeline final : public GraphicsPipelineCommon, public NonDispatch static VkResult CreatePipelineObjects( Device* pDevice, const VkGraphicsPipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const VkAllocationCallbacks* pAllocator, const PipelineLayout* pPipelineLayout, const VbBindingInfo* pVbInfo, diff --git a/icd/api/include/vk_graphics_pipeline_library.h b/icd/api/include/vk_graphics_pipeline_library.h index 9dcc9a0c..2c1340e3 100644 --- a/icd/api/include/vk_graphics_pipeline_library.h +++ b/icd/api/include/vk_graphics_pipeline_library.h @@ -41,7 +41,7 @@ class GraphicsPipelineLibrary final : public GraphicsPipelineCommon, public NonD Device* pDevice, PipelineCache* pPipelineCache, const VkGraphicsPipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const VkAllocationCallbacks* pAllocator, VkPipeline* pPipeline); diff --git a/icd/api/include/vk_physical_device.h b/icd/api/include/vk_physical_device.h index e0822e1b..f4e53789 100644 --- a/icd/api/include/vk_physical_device.h +++ b/icd/api/include/vk_physical_device.h @@ -748,6 +748,10 @@ template uint32* pFragmentShadingRateCount, VkPhysicalDeviceFragmentShadingRateKHR* pFragmentShadingRates); + VkResult GetPhysicalDeviceCooperativeMatrixPropertiesKHR( + uint32_t* pPropertyCount, + VkCooperativeMatrixPropertiesKHR* pProperties); + #if defined(__unix__) #ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT VkResult AcquireXlibDisplay( @@ -1200,6 +1204,11 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceFragmentShadingRatesKHR( uint32* pFragmentShadingRateCount, VkPhysicalDeviceFragmentShadingRateKHR* pFragmentShadingRates); +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR( + VkPhysicalDevice physicalDevice, + uint32_t* pPropertyCount, + VkCooperativeMatrixPropertiesKHR* pProperties); + } // namespace entry } // namespace vk diff --git a/icd/api/include/vk_pipeline.h b/icd/api/include/vk_pipeline.h index 1807c33e..316e4922 100644 --- a/icd/api/include/vk_pipeline.h +++ b/icd/api/include/vk_pipeline.h @@ -221,6 +221,14 @@ class Pipeline return m_pFormatStrings; } + static void GenerateHashFromSpecializationInfo( + const VkSpecializationInfo& desc, + Util::MetroHash128* pHasher); + + static void GenerateHashFromShaderStageCreateInfo( + const ShaderStageInfo& stageInfo, + Util::MetroHash128* pHasher); + static void ElfHashToCacheId( const Device* pDevice, uint32_t deviceIdx, @@ -248,16 +256,8 @@ class Pipeline const Util::MetroHash::Hash& cacheHash, uint64_t apiHash); - static PipelineCreateFlags GetCacheIdControlFlags( - PipelineCreateFlags in); - - static void GenerateHashFromSpecializationInfo( - const VkSpecializationInfo& desc, - Util::MetroHash128* pHasher); - - static void GenerateHashFromShaderStageCreateInfo( - const ShaderStageInfo& stageInfo, - Util::MetroHash128* pHasher); + static VkPipelineCreateFlags2KHR GetCacheIdControlFlags( + VkPipelineCreateFlags2KHR in); static void GenerateHashFromShaderStageCreateInfo( const VkPipelineShaderStageCreateInfo& desc, diff --git a/icd/api/include/vk_pipeline_layout.h b/icd/api/include/vk_pipeline_layout.h index 10bd1897..95d206cc 100644 --- a/icd/api/include/vk_pipeline_layout.h +++ b/icd/api/include/vk_pipeline_layout.h @@ -445,6 +445,8 @@ class PipelineLayout final : public NonDispatchable static void* GetFirstValidShaderData(const ShaderModuleHandle* pHandle); protected: - ShaderModule(size_t codeSize, const void* pCode); - VkResult Init(Device* pDevice, VkShaderModuleCreateFlags flags); + ShaderModule(size_t codeSize, const void* pCode, VkShaderModuleCreateFlags flags); + VkResult Init(Device* pDevice); size_t m_codeSize; const void* m_pCode; ShaderModuleHandle m_handle; Pal::ShaderHash m_codeHash; + VkShaderModuleCreateFlags m_flags; private: PAL_DISALLOW_COPY_AND_ASSIGN(ShaderModule); diff --git a/icd/api/pipeline_binary_cache.cpp b/icd/api/pipeline_binary_cache.cpp index 9882f54a..966d7836 100644 --- a/icd/api/pipeline_binary_cache.cpp +++ b/icd/api/pipeline_binary_cache.cpp @@ -685,109 +685,120 @@ Util::Result PipelineBinaryCache::InjectBinariesFromDirectory( if (settings.devModeElfReplacementDirectoryEnable) { - Util::File file; - char filePath[Util::PathBufferLen] = {}; - uint32_t fileCount = 0u; - const char** ppFileNames = nullptr; - size_t fileNameBufferSize = 0u; - void* pFileNameBuffer = nullptr; - size_t dirLength = strlen(settings.devModeElfReplacementDirectory) + 1u; - - Util::Hash128 pipelineHash = {}; - size_t pipelineBinarySize = 0u; - void* pPipelineBinary = nullptr; - - // Get the number of files in dir and the size of the buffer to hold their names - result = Util::ListDir( - settings.devModeElfReplacementDirectory, - &fileCount, - nullptr, - &fileNameBufferSize, - nullptr); - - if (fileCount == 0u) - { - return result; - } + size_t fileCount = 0u; + size_t fileNameBufferSize = 0u; - if (result == Util::Result::Success) + // Get the number of files in dir and the size of the buffer to hold their names. + result = Util::CountFilesInDir(settings.devModeElfReplacementDirectory, &fileCount, &fileNameBufferSize); + + if ((fileCount > 0u) && (result == Util::Result::Success)) { - // Allocate space for ppFileNames and pFileNameBuffer - ppFileNames = (const char**)AllocMem(sizeof(const char*) * fileCount); - pFileNameBuffer = AllocMem(fileNameBufferSize); - - // Populate ppFileNames and pFileNameBuffer - result = Util::ListDir( - settings.devModeElfReplacementDirectory, - &fileCount, - ppFileNames, - &fileNameBufferSize, - pFileNameBuffer); - - if (result != Util::Result::Success) + char* pFileNameBuffer = nullptr; + Util::Span> fileNames; + Util::Span fileNameBuffer; + + // Allocate space for pFileNames and pFileNameBuffer + Util::StringView* pFileNames = static_cast*>( + AllocMem(sizeof(Util::StringView) * fileCount)); + + if (pFileNames == nullptr) { - FreeMem(pFileNameBuffer); - FreeMem(ppFileNames); + result = Util::Result::ErrorOutOfMemory; } - } - if (result == Util::Result::Success) - { - // Store each file into cache - Util::Strncpy(filePath, settings.devModeElfReplacementDirectory, sizeof(filePath)); - Util::Strncat(filePath, sizeof(filePath), "\\"); - for (uint32_t fileIndex = 0; fileIndex < fileCount; fileIndex++) + if (result == Util::Result::Success) { - filePath[dirLength] = '\0'; - Util::Strncat(filePath, sizeof(filePath), ppFileNames[fileIndex]); + pFileNameBuffer = static_cast(AllocMem(fileNameBufferSize)); - ppFileNames[fileIndex] = strstr(ppFileNames[fileIndex], "_0x"); + if (pFileNameBuffer == nullptr) + { + FreeMem(pFileNames); + result = Util::Result::ErrorOutOfMemory; + } + } + + if (result == Util::Result::Success) + { + fileNames = Util::Span>(pFileNames, fileCount); + fileNameBuffer = Util::Span(pFileNameBuffer, fileNameBufferSize); + + // Populate fileNames and fileNameBuffer. + result = Util::GetFileNamesInDir(settings.devModeElfReplacementDirectory, fileNames, fileNameBuffer); - if ((ppFileNames[fileIndex] != nullptr) && - (strlen(ppFileNames[fileIndex]) >= 32)) + if (result != Util::Result::Success) { - ppFileNames[fileIndex] += 3u; - pipelineHash = ParseHash128(ppFileNames[fileIndex]); + FreeMem(pFileNameBuffer); + FreeMem(pFileNames); + } + } - if (Util::File::Exists(filePath)) + if (result == Util::Result::Success) + { + Util::File file; + char filePath[Util::PathBufferLen] = {}; + Util::Hash128 pipelineHash = {}; + size_t pipelineBinarySize = 0u; + void* pPipelineBinary = nullptr; + + // Store each file into cache + Util::Strncpy(filePath, settings.devModeElfReplacementDirectory, sizeof(filePath)); + Util::Strncat(filePath, sizeof(filePath), "\\"); + for (uint32_t fileIndex = 0; fileIndex < fileCount; fileIndex++) + { + const char* pFileName = fileNames[fileIndex].Data(); + + filePath[strlen(settings.devModeElfReplacementDirectory)] = '\0'; + + Util::Strncat(filePath, fileNames[fileIndex].Length(), pFileName); + + pFileName = strstr(pFileName, "_0x"); + + if ((pFileName != nullptr) && + (strlen(pFileName) >= 32)) { - pipelineBinarySize = Util::File::GetFileSize(filePath); - pPipelineBinary = AllocMem(pipelineBinarySize); + pFileName += 3u; + pipelineHash = ParseHash128(pFileName); - if (pPipelineBinary != nullptr) + if (Util::File::Exists(filePath)) { - if (file.Open( - filePath, - Util::FileAccessRead | Util::FileAccessBinary) == Util::Result::Success) + pipelineBinarySize = Util::File::GetFileSize(filePath); + pPipelineBinary = AllocMem(pipelineBinarySize); + + if (pPipelineBinary != nullptr) { - if (file.Read(pPipelineBinary, pipelineBinarySize, nullptr) == Util::Result::Success) + if (file.Open( + filePath, + Util::FileAccessRead | Util::FileAccessBinary) == Util::Result::Success) { - StoreReinjectionBinary(&pipelineHash, pipelineBinarySize, pPipelineBinary); + if (file.Read(pPipelineBinary, pipelineBinarySize, nullptr) == Util::Result::Success) + { + StoreReinjectionBinary(&pipelineHash, pipelineBinarySize, pPipelineBinary); + } + else + { + VK_NEVER_CALLED(); + } + + file.Close(); } else { VK_NEVER_CALLED(); } - - file.Close(); - } - else - { - VK_NEVER_CALLED(); } - } - FreeMem(pPipelineBinary); - } - else - { - VK_NEVER_CALLED(); + FreeMem(pPipelineBinary); + } + else + { + VK_NEVER_CALLED(); + } } } - } - FreeMem(pFileNameBuffer); - FreeMem(ppFileNames); + FreeMem(pFileNameBuffer); + FreeMem(pFileNames); + } } } diff --git a/icd/api/pipeline_compiler.cpp b/icd/api/pipeline_compiler.cpp index 9916f0b8..81263dd1 100644 --- a/icd/api/pipeline_compiler.cpp +++ b/icd/api/pipeline_compiler.cpp @@ -427,8 +427,8 @@ Util::MetroHash::Hash PipelineCompiler::GetShaderModuleCacheHash( // ===================================================================================================================== // Loads shader module from cache, include both run-time cache and binary cache VkResult PipelineCompiler::LoadShaderModuleFromCache( - const Device* pDevice, const VkShaderModuleCreateFlags flags, + const VkShaderModuleCreateFlags internalShaderFlags, const uint32_t compilerMask, const Util::MetroHash::Hash& uniqueHash, PipelineBinaryCache* pBinaryCache, @@ -438,6 +438,7 @@ VkResult PipelineCompiler::LoadShaderModuleFromCache( VkResult result = VK_ERROR_INITIALIZATION_FAILED; const bool supportInternalModuleCache = SupportInternalModuleCache(m_pPhysicalDevice, compilerMask); + const bool delayConversion = false; VK_ASSERT(pShaderModule->pRefCount == nullptr); @@ -483,7 +484,7 @@ VkResult PipelineCompiler::LoadShaderModuleFromCache( if ((result != VK_SUCCESS) && (cacheResult == Util::Result::Success)) { - if ((result == VK_SUCCESS) && (supportInternalModuleCache)) + if ((result == VK_SUCCESS) && supportInternalModuleCache && (delayConversion == false)) { Instance* pInstance = m_pPhysicalDevice->VkInstance(); pShaderModule->pRefCount = reinterpret_cast( @@ -526,8 +527,8 @@ VkResult PipelineCompiler::LoadShaderModuleFromCache( // ===================================================================================================================== // Stores shader module to cache, include both run-time cache and binary cache void PipelineCompiler::StoreShaderModuleToCache( - const Device* pDevice, const VkShaderModuleCreateFlags flags, + const VkShaderModuleCreateFlags internalShaderFlags, const uint32_t compilerMask, const Util::MetroHash::Hash& uniqueHash, PipelineBinaryCache* pBinaryCache, @@ -620,9 +621,7 @@ VkResult PipelineCompiler::BuildShaderModule( } result = LoadShaderModuleFromCache( - pDevice, flags, compilerMask, uniqueHash, pBinaryCache, pFeedback, pShaderModule); - - VkShaderModuleCreateFlags internalFlags = internalShaderFlags; + flags, internalShaderFlags, compilerMask, uniqueHash, pBinaryCache, pFeedback, pShaderModule); if (result != VK_SUCCESS) { @@ -631,26 +630,22 @@ VkResult PipelineCompiler::BuildShaderModule( result = m_compilerSolutionLlpc.BuildShaderModule( pDevice, flags, - internalFlags, + internalShaderFlags, finalData, adaptForFastLink, isInternal, pShaderModule, PipelineOptimizerKey{}); - } - { - StoreShaderModuleToCache(pDevice, flags, compilerMask, uniqueHash, pBinaryCache, pShaderModule); } + + StoreShaderModuleToCache(flags, internalShaderFlags, compilerMask, uniqueHash, pBinaryCache, pShaderModule); } else { - if (result == VK_SUCCESS) + if (pSettings->enablePipelineDump) { - if (pSettings->enablePipelineDump) - { - Vkgc::IPipelineDumper::DumpSpirvBinary(pSettings->pipelineDumpDir, &finalData); - } + Vkgc::IPipelineDumper::DumpSpirvBinary(pSettings->pipelineDumpDir, &finalData); } } @@ -1036,7 +1031,7 @@ VkResult PipelineCompiler::CreateGraphicsPipelineBinary( uint32_t deviceIdx, PipelineCache* pPipelineCache, GraphicsPipelineBinaryCreateInfo* pCreateInfo, - const PipelineCreateFlags flags, + const VkPipelineCreateFlags2KHR flags, Vkgc::BinaryData* pPipelineBinary, Util::MetroHash::Hash* pCacheId) { @@ -1713,6 +1708,7 @@ static void CopyPipelineShadersInfo( (libInfo.pBinaryMetadata->pFsOutputMetaData != nullptr)); pCreateInfo->pBinaryMetadata->postDepthCoverageEnable = libInfo.pBinaryMetadata->postDepthCoverageEnable; pCreateInfo->pBinaryMetadata->psOnlyPointCoordEnable = libInfo.pBinaryMetadata->psOnlyPointCoordEnable; + pCreateInfo->pBinaryMetadata->dualSrcBlendingUsed = libInfo.pBinaryMetadata->dualSrcBlendingUsed; } Vkgc::PipelineShaderInfo* pShaderInfosDst[] = @@ -2531,6 +2527,7 @@ static void BuildVertexInputInterfaceState( static void BuildPreRasterizationShaderState( const Device* pDevice, const VkGraphicsPipelineCreateInfo* pIn, + const GraphicsPipelineLibraryInfo& libInfo, const GraphicsPipelineShaderStageInfo* pShaderInfo, const uint64_t dynamicStateFlags, const VkShaderStageFlagBits activeStages, @@ -2538,10 +2535,16 @@ static void BuildPreRasterizationShaderState( { const RenderPass* pRenderPass = RenderPass::ObjectFromHandle(pIn->renderPass); bool isConservativeOverestimation = false; + bool vertexInputAbsent = + libInfo.flags.isLibrary && + (libInfo.pVertexInputInterfaceLib == nullptr) && + ((libInfo.libFlags & VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) == 0); + bool unrestrictedPrimitiveTopology = pDevice->GetEnabledFeatures().assumeDynamicTopologyInLibs || (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::PrimitiveTopology) && - pDevice->GetEnabledFeatures().dynamicPrimitiveTopologyUnrestricted); + pDevice->GetEnabledFeatures().dynamicPrimitiveTopologyUnrestricted) || + (vertexInputAbsent && pDevice->GetRuntimeSettings().useShaderLibraryForPipelineLibraryFastLink); BuildRasterizationState(pIn->pRasterizationState, dynamicStateFlags, &isConservativeOverestimation, pCreateInfo); @@ -2651,7 +2654,7 @@ static void BuildFragmentOutputInterfaceState( static void BuildExecutablePipelineState( const Device* pDevice, const VkGraphicsPipelineCreateInfo* pIn, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const GraphicsPipelineShaderStageInfo* pShaderInfo, const GraphicsPipelineLibraryInfo* pLibInfo, const PipelineLayout* pPipelineLayout, @@ -2745,7 +2748,7 @@ static void BuildExecutablePipelineState( VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( const Device* pDevice, const VkGraphicsPipelineCreateInfo* pIn, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const GraphicsPipelineShaderStageInfo* pShaderInfo, const PipelineLayout* pPipelineLayout, PipelineOptimizerKey* pPipelineProfileKey, @@ -2756,37 +2759,45 @@ VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( VkResult result = VK_SUCCESS; - pCreateInfo->pBinaryMetadata = pBinaryMetadata; - pCreateInfo->pPipelineProfileKey = pPipelineProfileKey; - GraphicsPipelineLibraryInfo libInfo; - GraphicsPipelineCommon::ExtractLibraryInfo(pIn, flags, &libInfo); - pCreateInfo->libFlags = libInfo.libFlags; + if (result == VK_SUCCESS) + { + pCreateInfo->pBinaryMetadata = pBinaryMetadata; + pCreateInfo->pPipelineProfileKey = pPipelineProfileKey; - pCreateInfo->libFlags |= (libInfo.pVertexInputInterfaceLib == nullptr) ? - 0 : VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT; - pCreateInfo->libFlags |= (libInfo.pPreRasterizationShaderLib == nullptr) ? - 0 : VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT; - pCreateInfo->libFlags |= (libInfo.pFragmentShaderLib == nullptr) ? - 0 : VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT; - pCreateInfo->libFlags |= (libInfo.pFragmentOutputInterfaceLib == nullptr) ? - 0 : VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT; + GraphicsPipelineCommon::ExtractLibraryInfo(pIn, flags, &libInfo); - uint32_t libFlags = libInfo.libFlags; + pCreateInfo->libFlags = libInfo.libFlags; - VkShaderStageFlagBits activeStages = GraphicsPipelineCommon::GetActiveShaderStages(pIn, &libInfo); - uint64_t dynamicStateFlags = GraphicsPipelineCommon::GetDynamicStateFlags(pIn->pDynamicState, &libInfo); + pCreateInfo->libFlags |= (libInfo.pVertexInputInterfaceLib == nullptr) ? + 0 : VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT; + pCreateInfo->libFlags |= (libInfo.pPreRasterizationShaderLib == nullptr) ? + 0 : VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT; + pCreateInfo->libFlags |= (libInfo.pFragmentShaderLib == nullptr) ? + 0 : VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT; + pCreateInfo->libFlags |= (libInfo.pFragmentOutputInterfaceLib == nullptr) ? + 0 : VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT; - libInfo.libFlags = libFlags; - pCreateInfo->flags = pIn->flags; - pDevice->GetCompiler(DefaultDeviceIndex)->ApplyPipelineOptions(pDevice, - pIn->flags, - &pCreateInfo->pipelineInfo.options - ); + uint32_t libFlags = libInfo.libFlags; + + libInfo.libFlags = libFlags; + pCreateInfo->flags = pIn->flags; + pDevice->GetCompiler(DefaultDeviceIndex)->ApplyPipelineOptions(pDevice, + pIn->flags, + &pCreateInfo->pipelineInfo.options + ); + + } + + uint64_t dynamicStateFlags = 0; if (result == VK_SUCCESS) { + VkShaderStageFlagBits activeStages = GraphicsPipelineCommon::GetActiveShaderStages(pIn, &libInfo); + + dynamicStateFlags = GraphicsPipelineCommon::GetDynamicStateFlags(pIn->pDynamicState, &libInfo); + if (libInfo.libFlags & VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) { BuildVertexInputInterfaceState(pDevice, pIn, dynamicStateFlags, activeStages, pCreateInfo); @@ -2798,7 +2809,8 @@ VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( if (libInfo.libFlags & VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) { - BuildPreRasterizationShaderState(pDevice, pIn, pShaderInfo, dynamicStateFlags, activeStages, pCreateInfo); + BuildPreRasterizationShaderState( + pDevice, pIn, libInfo, pShaderInfo, dynamicStateFlags, activeStages, pCreateInfo); } else if (libInfo.pPreRasterizationShaderLib != nullptr) { @@ -2902,7 +2914,7 @@ VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( VkResult PipelineCompiler::BuildGplFastLinkCreateInfo( const Device* pDevice, const VkGraphicsPipelineCreateInfo* pIn, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const GraphicsPipelineLibraryInfo& libInfo, const PipelineLayout* pPipelineLayout, PipelineMetadata* pBinaryMetadata, @@ -3037,9 +3049,9 @@ uint32_t PipelineCompiler::GetCompilerCollectionMask() // ===================================================================================================================== void PipelineCompiler::ApplyPipelineOptions( - const Device* pDevice, - VkPipelineCreateFlags flags, - Vkgc::PipelineOptions* pOptions + const Device* pDevice, + VkPipelineCreateFlags2KHR flags, + Vkgc::PipelineOptions* pOptions ) { if (pDevice->IsExtensionEnabled(DeviceExtensions::AMD_SHADER_INFO) || @@ -3107,6 +3119,10 @@ void PipelineCompiler::ApplyPipelineOptions( { pOptions->extendedRobustness.nullDescriptor = true; } + if (pDevice->GetEnabledFeatures().primitivesGeneratedQuery) + { + pOptions->enablePrimGeneratedQuery = true; + } } // ===================================================================================================================== @@ -3118,105 +3134,109 @@ VkResult PipelineCompiler::ConvertComputePipelineInfo( const PipelineOptimizerKey* pPipelineProfileKey, PipelineMetadata* pBinaryMetadata, ComputePipelineBinaryCreateInfo* pCreateInfo, - PipelineCreateFlags flags) + VkPipelineCreateFlags2KHR flags) { VkResult result = VK_SUCCESS; + auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); - auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); - - PipelineLayout* pLayout = nullptr; - - VK_ASSERT(pIn->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO); - - if (pIn->layout != VK_NULL_HANDLE) + if (result == VK_SUCCESS) { - pLayout = PipelineLayout::ObjectFromHandle(pIn->layout); - } + PipelineLayout* pLayout = nullptr; - pCreateInfo->pBinaryMetadata = pBinaryMetadata; - pCreateInfo->pPipelineProfileKey = pPipelineProfileKey; - pCreateInfo->flags = flags; - - ApplyPipelineOptions(pDevice, - flags, - &pCreateInfo->pipelineInfo.options - ); - - pCreateInfo->pipelineInfo.cs.pModuleData = - ShaderModule::GetFirstValidShaderData(pShaderInfo->stage.pModuleHandle); - - pCreateInfo->pipelineInfo.cs.pSpecializationInfo = pShaderInfo->stage.pSpecializationInfo; - pCreateInfo->pipelineInfo.cs.pEntryTarget = pShaderInfo->stage.pEntryPoint; - pCreateInfo->pipelineInfo.cs.entryStage = Vkgc::ShaderStageCompute; + if (pIn->layout != VK_NULL_HANDLE) + { + pLayout = PipelineLayout::ObjectFromHandle(pIn->layout); + } - if (pShaderInfo->stage.waveSize != 0) - { - pCreateInfo->pipelineInfo.cs.options.waveSize = pShaderInfo->stage.waveSize; - pCreateInfo->pipelineInfo.cs.options.allowVaryWaveSize = true; - } + pCreateInfo->pBinaryMetadata = pBinaryMetadata; + pCreateInfo->pPipelineProfileKey = pPipelineProfileKey; + pCreateInfo->flags = flags; - if ((pLayout != nullptr) && (pLayout->GetPipelineInfo()->mappingBufferSize > 0)) - { + ApplyPipelineOptions(pDevice, + flags, + &pCreateInfo->pipelineInfo.options + ); - size_t genericMappingBufferSize = pLayout->GetPipelineInfo()->mappingBufferSize; + pCreateInfo->pipelineInfo.cs.pModuleData = + ShaderModule::GetFirstValidShaderData(pShaderInfo->stage.pModuleHandle); - size_t tempBufferSize = genericMappingBufferSize + pCreateInfo->mappingBufferSize; - pCreateInfo->pTempBuffer = pInstance->AllocMem(tempBufferSize, - VK_DEFAULT_MEM_ALIGN, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + pCreateInfo->pipelineInfo.cs.pSpecializationInfo = pShaderInfo->stage.pSpecializationInfo; + pCreateInfo->pipelineInfo.cs.pEntryTarget = pShaderInfo->stage.pEntryPoint; + pCreateInfo->pipelineInfo.cs.entryStage = Vkgc::ShaderStageCompute; - if (pCreateInfo->pTempBuffer == nullptr) + if (pShaderInfo->stage.waveSize != 0) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; + pCreateInfo->pipelineInfo.cs.options.waveSize = pShaderInfo->stage.waveSize; + pCreateInfo->pipelineInfo.cs.options.allowVaryWaveSize = true; } - else + + if ((pLayout != nullptr) && (pLayout->GetPipelineInfo()->mappingBufferSize > 0)) { - pCreateInfo->pipelineInfo.pipelineLayoutApiHash = pLayout->GetApiHash(); - pCreateInfo->pMappingBuffer = Util::VoidPtrInc(pCreateInfo->pTempBuffer, genericMappingBufferSize); + size_t genericMappingBufferSize = pLayout->GetPipelineInfo()->mappingBufferSize; - // NOTE: Zero the allocated space that is used to create pipeline resource mappings. Some - // fields of resource mapping nodes are unused for certain node types. We must initialize - // them to zeroes. - memset(pCreateInfo->pTempBuffer, 0, tempBufferSize); + size_t tempBufferSize = genericMappingBufferSize + pCreateInfo->mappingBufferSize; + pCreateInfo->pTempBuffer = pInstance->AllocMem(tempBufferSize, + VK_DEFAULT_MEM_ALIGN, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - // Build the LLPC resource mapping description. This data contains things about how shader - // inputs like descriptor set bindings are communicated to this pipeline in a form that - // LLPC can understand. - result = pLayout->BuildLlpcPipelineMapping(Vkgc::ShaderStageComputeBit, - nullptr, - false, + if (pCreateInfo->pTempBuffer == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + else + { + pCreateInfo->pipelineInfo.pipelineLayoutApiHash = pLayout->GetApiHash(); + + pCreateInfo->pMappingBuffer = Util::VoidPtrInc(pCreateInfo->pTempBuffer, genericMappingBufferSize); + + // NOTE: Zero the allocated space that is used to create pipeline resource mappings. Some + // fields of resource mapping nodes are unused for certain node types. We must initialize + // them to zeroes. + memset(pCreateInfo->pTempBuffer, 0, tempBufferSize); + + // Build the LLPC resource mapping description. This data contains things about how shader + // inputs like descriptor set bindings are communicated to this pipeline in a form that + // LLPC can understand. + result = pLayout->BuildLlpcPipelineMapping(Vkgc::ShaderStageComputeBit, + nullptr, + false, #if VKI_RAY_TRACING - false, + false, #endif - pCreateInfo->pTempBuffer, - &pCreateInfo->pipelineInfo.resourceMapping, - &pCreateInfo->pipelineInfo.options.resourceLayoutScheme); + pCreateInfo->pTempBuffer, + &pCreateInfo->pipelineInfo.resourceMapping, + &pCreateInfo->pipelineInfo.options.resourceLayoutScheme); + } } } - pCreateInfo->compilerType = CheckCompilerType(&pCreateInfo->pipelineInfo); - - if (pShaderInfo->stage.pModuleHandle != nullptr) + if (result == VK_SUCCESS) { - pCreateInfo->pipelineInfo.cs.pModuleData = - ShaderModule::GetShaderData(pCreateInfo->compilerType, pShaderInfo->stage.pModuleHandle); - } + pCreateInfo->compilerType = CheckCompilerType(&pCreateInfo->pipelineInfo); + + if (pShaderInfo->stage.pModuleHandle != nullptr) + { + pCreateInfo->pipelineInfo.cs.pModuleData = + ShaderModule::GetShaderData(pCreateInfo->compilerType, pShaderInfo->stage.pModuleHandle); + } #if VKI_RAY_TRACING - auto& settings = m_pPhysicalDevice->GetRuntimeSettings(); + auto& settings = m_pPhysicalDevice->GetRuntimeSettings(); #endif #if VKI_RAY_TRACING - const auto* pModuleData = reinterpret_cast - (pCreateInfo->pipelineInfo.cs.pModuleData); + const auto* pModuleData = reinterpret_cast + (pCreateInfo->pipelineInfo.cs.pModuleData); - if ((pModuleData != nullptr) && - pModuleData->usage.enableRayQuery) - { - SetRayTracingState(pDevice, &(pCreateInfo->pipelineInfo.rtState), 0); - } + if ((pModuleData != nullptr) && + pModuleData->usage.enableRayQuery) + { + SetRayTracingState(pDevice, &(pCreateInfo->pipelineInfo.rtState), 0); + } #endif + } + if (result == VK_SUCCESS) { ApplyDefaultShaderOptions(ShaderStage::ShaderStageCompute, @@ -3369,306 +3389,306 @@ void PipelineCompiler::FreeGraphicsPipelineCreateInfo( VkResult PipelineCompiler::ConvertRayTracingPipelineInfo( const Device* pDevice, const VkRayTracingPipelineCreateInfoKHR* pIn, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const RayTracingPipelineShaderStageInfo* pShaderInfo, const PipelineOptimizerKey* pPipelineProfileKey, RayTracingPipelineBinaryCreateInfo* pCreateInfo) { - VkResult result = VK_SUCCESS; + VkResult result = VK_SUCCESS; auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); - auto& settings = m_pPhysicalDevice->GetRuntimeSettings(); - - PipelineLayout* pLayout = nullptr; - - VK_ASSERT(pIn->sType == VK_STRUCTURE_TYPE_RAY_TRACING_PIPELINE_CREATE_INFO_KHR); + auto& settings = m_pPhysicalDevice->GetRuntimeSettings(); - if (pIn->layout != VK_NULL_HANDLE) + if (result == VK_SUCCESS) { - pLayout = PipelineLayout::ObjectFromHandle(pIn->layout); - } + PipelineLayout* pLayout = nullptr; - pCreateInfo->pPipelineProfileKey = pPipelineProfileKey; - pCreateInfo->flags = flags; + if (pIn->layout != VK_NULL_HANDLE) + { + pLayout = PipelineLayout::ObjectFromHandle(pIn->layout); + } - bool hasLibraries = ((pIn->pLibraryInfo != nullptr) && (pIn->pLibraryInfo->libraryCount > 0)) && - settings.rtEnableCompilePipelineLibrary; - bool isLibrary = Util::TestAnyFlagSet(pIn->flags, VK_PIPELINE_CREATE_LIBRARY_BIT_KHR) && - settings.rtEnableCompilePipelineLibrary; - bool hasProcedural = false; + pCreateInfo->pPipelineProfileKey = pPipelineProfileKey; + pCreateInfo->flags = flags; - bool isReplay = ((pIn->groupCount > 0) && (pIn->pGroups[0].pShaderGroupCaptureReplayHandle != nullptr)); + bool hasLibraries = ((pIn->pLibraryInfo != nullptr) && (pIn->pLibraryInfo->libraryCount > 0)) && + settings.rtEnableCompilePipelineLibrary; + bool isLibrary = Util::TestAnyFlagSet(flags, VK_PIPELINE_CREATE_LIBRARY_BIT_KHR) && + settings.rtEnableCompilePipelineLibrary; + bool hasProcedural = false; - if (hasLibraries) - { - VkShaderStageFlags libraryStageMask = 0; + bool isReplay = ((pIn->groupCount > 0) && (pIn->pGroups[0].pShaderGroupCaptureReplayHandle != nullptr)); - // Visit the library shader groups - for (uint32_t libraryIdx = 0; libraryIdx < pIn->pLibraryInfo->libraryCount; ++libraryIdx) + if (hasLibraries) { - VkPipeline libraryHandle = pIn->pLibraryInfo->pLibraries[libraryIdx]; - RayTracingPipeline* pLibrary = RayTracingPipeline::ObjectFromHandle(libraryHandle); - const ShaderGroupInfo* pShaderGroupInfos = pLibrary->GetShaderGroupInfos(); + VkShaderStageFlags libraryStageMask = 0; - if (pLibrary->CheckHasTraceRay()) + // Visit the library shader groups + for (uint32_t libraryIdx = 0; libraryIdx < pIn->pLibraryInfo->libraryCount; ++libraryIdx) { - libraryStageMask |= VK_SHADER_STAGE_COMPUTE_BIT; - } + VkPipeline libraryHandle = pIn->pLibraryInfo->pLibraries[libraryIdx]; + RayTracingPipeline* pLibrary = RayTracingPipeline::ObjectFromHandle(libraryHandle); + const ShaderGroupInfo* pShaderGroupInfos = pLibrary->GetShaderGroupInfos(); - for (uint32_t groupIdx = 0; groupIdx < pLibrary->GetShaderGroupCount(); groupIdx++) - { - libraryStageMask |= pShaderGroupInfos[groupIdx].stages; + if (pLibrary->CheckHasTraceRay()) + { + libraryStageMask |= VK_SHADER_STAGE_COMPUTE_BIT; + } - if (pShaderGroupInfos[groupIdx].type == - VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR) + for (uint32_t groupIdx = 0; groupIdx < pLibrary->GetShaderGroupCount(); groupIdx++) { - hasProcedural = true; + libraryStageMask |= pShaderGroupInfos[groupIdx].stages; + + if (pShaderGroupInfos[groupIdx].type == + VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR) + { + hasProcedural = true; + } } } - } - pCreateInfo->pipelineInfo.pipelineLibStageMask = VkToVkgcShaderStageMask(libraryStageMask); - } + pCreateInfo->pipelineInfo.pipelineLibStageMask = VkToVkgcShaderStageMask(libraryStageMask); + } - // Implicitly include the SKIP_AABBS pipeline flag if there are no procedural - // shader groups. This should be common for triangle-only setups and will - // simplify the traversal routine. Note this guarantee cannot be made for - // pipeline libraries - if (settings.rtAutoSkipAabbIntersections && (isLibrary == false)) - { - for (uint32_t groupIdx = 0; groupIdx < pIn->groupCount; groupIdx++) + // Implicitly include the SKIP_AABBS pipeline flag if there are no procedural + // shader groups. This should be common for triangle-only setups and will + // simplify the traversal routine. Note this guarantee cannot be made for + // pipeline libraries + if (settings.rtAutoSkipAabbIntersections && (isLibrary == false)) { - if (pIn->pGroups[groupIdx].type == - VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR) + for (uint32_t groupIdx = 0; groupIdx < pIn->groupCount; groupIdx++) { - hasProcedural = true; + if (pIn->pGroups[groupIdx].type == + VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR) + { + hasProcedural = true; - break; + break; + } } - } - if (hasProcedural == false) - { - pCreateInfo->flags |= VK_PIPELINE_CREATE_RAY_TRACING_SKIP_AABBS_BIT_KHR; + if (hasProcedural == false) + { + pCreateInfo->flags |= VK_PIPELINE_CREATE_RAY_TRACING_SKIP_AABBS_BIT_KHR; + } } - } - ApplyPipelineOptions(pDevice, flags, &pCreateInfo->pipelineInfo.options - ); + ApplyPipelineOptions(pDevice, flags, &pCreateInfo->pipelineInfo.options + ); - pCreateInfo->pipelineInfo.options.disableImageResourceCheck = settings.disableRayTracingImageResourceTypeCheck; + pCreateInfo->pipelineInfo.options.disableImageResourceCheck = settings.disableRayTracingImageResourceTypeCheck; - pCreateInfo->pipelineInfo.maxRecursionDepth = pIn->maxPipelineRayRecursionDepth; - pCreateInfo->pipelineInfo.indirectStageMask = settings.rtIndirectStageMask; - static_assert(RaytracingNone == static_cast(Vkgc::LlpcRaytracingMode::None)); - static_assert(RaytracingLegacy == static_cast(Vkgc::LlpcRaytracingMode::Legacy)); + pCreateInfo->pipelineInfo.maxRecursionDepth = pIn->maxPipelineRayRecursionDepth; + pCreateInfo->pipelineInfo.indirectStageMask = settings.rtIndirectStageMask; + static_assert(RaytracingNone == static_cast(Vkgc::LlpcRaytracingMode::None)); + static_assert(RaytracingLegacy == static_cast(Vkgc::LlpcRaytracingMode::Legacy)); #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 69 - static_assert(RaytracingContinufy == static_cast(Vkgc::LlpcRaytracingMode::Gpurt2)); + static_assert(RaytracingContinufy == static_cast(Vkgc::LlpcRaytracingMode::Gpurt2)); #else - static_assert(RaytracingContinufy == static_cast(Vkgc::LlpcRaytracingMode::Continufy)); + static_assert(RaytracingContinufy == static_cast(Vkgc::LlpcRaytracingMode::Continufy)); #endif - static_assert(RaytracingContinuations == static_cast(Vkgc::LlpcRaytracingMode::Continuations)); - pCreateInfo->pipelineInfo.mode = static_cast(settings.llpcRaytracingMode); + static_assert(RaytracingContinuations == static_cast(Vkgc::LlpcRaytracingMode::Continuations)); + pCreateInfo->pipelineInfo.mode = static_cast(settings.llpcRaytracingMode); - pCreateInfo->pipelineInfo.isReplay = isReplay; + pCreateInfo->pipelineInfo.isReplay = isReplay; - // pLibraryInterface must be populated (per spec) if the pipeline is a library or has libraries - VK_ASSERT((pIn->pLibraryInterface != nullptr) || ((isLibrary || hasLibraries) == false)); + // pLibraryInterface must be populated (per spec) if the pipeline is a library or has libraries + VK_ASSERT((pIn->pLibraryInterface != nullptr) || ((isLibrary || hasLibraries) == false)); - if (isLibrary || hasLibraries) - { - // When pipeline libraries are involved maxPayloadSize and maxAttributeSize are read from - pCreateInfo->pipelineInfo.payloadSizeMaxInLib = pIn->pLibraryInterface->maxPipelineRayPayloadSize; - pCreateInfo->pipelineInfo.attributeSizeMaxInLib = pIn->pLibraryInterface->maxPipelineRayHitAttributeSize; - } - - if (hasLibraries) - { - // pipeline library, or pipeline that contains pipeline library(s) - pCreateInfo->pipelineInfo.hasPipelineLibrary = true; - } - else - { - pCreateInfo->pipelineInfo.hasPipelineLibrary = false; - } + if (isLibrary || hasLibraries) + { + // When pipeline libraries are involved maxPayloadSize and maxAttributeSize are read from + pCreateInfo->pipelineInfo.payloadSizeMaxInLib = pIn->pLibraryInterface->maxPipelineRayPayloadSize; + pCreateInfo->pipelineInfo.attributeSizeMaxInLib = pIn->pLibraryInterface->maxPipelineRayHitAttributeSize; + } - size_t pipelineInfoBufferSize = pShaderInfo->stageCount * sizeof(Vkgc::PipelineShaderInfo); - size_t tempBufferSize = pipelineInfoBufferSize; + if (hasLibraries) + { + // pipeline library, or pipeline that contains pipeline library(s) + pCreateInfo->pipelineInfo.hasPipelineLibrary = true; + } + else + { + pCreateInfo->pipelineInfo.hasPipelineLibrary = false; + } - size_t genericMappingBufferSize = 0; - if (pLayout != nullptr) - { - genericMappingBufferSize = pLayout->GetPipelineInfo()->mappingBufferSize; + size_t pipelineInfoBufferSize = pShaderInfo->stageCount * sizeof(Vkgc::PipelineShaderInfo); + size_t tempBufferSize = pipelineInfoBufferSize; - tempBufferSize += genericMappingBufferSize + pCreateInfo->mappingBufferSize; - } + size_t genericMappingBufferSize = 0; + if (pLayout != nullptr) + { + genericMappingBufferSize = pLayout->GetPipelineInfo()->mappingBufferSize; - // We can't have a pipeline with 0 shader stages - VK_ASSERT(tempBufferSize > 0); + tempBufferSize += genericMappingBufferSize + pCreateInfo->mappingBufferSize; + } - pCreateInfo->pTempBuffer = pInstance->AllocMem(tempBufferSize, - VK_DEFAULT_MEM_ALIGN, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + // We can't have a pipeline with 0 shader stages + VK_ASSERT(tempBufferSize > 0); - size_t tempBufferOffset = 0; + pCreateInfo->pTempBuffer = pInstance->AllocMem(tempBufferSize, + VK_DEFAULT_MEM_ALIGN, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (pCreateInfo->pTempBuffer == nullptr) - { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - } - else - { - // NOTE: Zero the allocated space that is used to create pipeline resource mappings. Some - // fields of resource mapping nodes are unused for certain node types. We must initialize - // them to zeroes. - memset(pCreateInfo->pTempBuffer, 0, tempBufferSize); + size_t tempBufferOffset = 0; - if ((pLayout != nullptr) && (pLayout->GetPipelineInfo()->mappingBufferSize > 0)) + if (pCreateInfo->pTempBuffer == nullptr) { - pCreateInfo->pipelineInfo.pipelineLayoutApiHash = pLayout->GetApiHash(); - - pCreateInfo->pMappingBuffer = pCreateInfo->pTempBuffer; - tempBufferOffset += pCreateInfo->mappingBufferSize; - - constexpr uint32_t RayTracingStageMask = Vkgc::ShaderStageRayTracingRayGenBit | - Vkgc::ShaderStageRayTracingIntersectBit | - Vkgc::ShaderStageRayTracingAnyHitBit | - Vkgc::ShaderStageRayTracingClosestHitBit | - Vkgc::ShaderStageRayTracingMissBit | - Vkgc::ShaderStageRayTracingCallableBit; + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + else + { + // NOTE: Zero the allocated space that is used to create pipeline resource mappings. Some + // fields of resource mapping nodes are unused for certain node types. We must initialize + // them to zeroes. + memset(pCreateInfo->pTempBuffer, 0, tempBufferSize); - // Build the LLPC resource mapping description. This data contains things about how shader - // inputs like descriptor set bindings are communicated to this pipeline in a form that - // LLPC can understand. - result = pLayout->BuildLlpcPipelineMapping(RayTracingStageMask, - nullptr, - false, + if ((pLayout != nullptr) && (pLayout->GetPipelineInfo()->mappingBufferSize > 0)) + { + pCreateInfo->pipelineInfo.pipelineLayoutApiHash = pLayout->GetApiHash(); + + pCreateInfo->pMappingBuffer = pCreateInfo->pTempBuffer; + tempBufferOffset += pCreateInfo->mappingBufferSize; + + constexpr uint32_t RayTracingStageMask = Vkgc::ShaderStageRayTracingRayGenBit | + Vkgc::ShaderStageRayTracingIntersectBit | + Vkgc::ShaderStageRayTracingAnyHitBit | + Vkgc::ShaderStageRayTracingClosestHitBit | + Vkgc::ShaderStageRayTracingMissBit | + Vkgc::ShaderStageRayTracingCallableBit; + + // Build the LLPC resource mapping description. This data contains things about how shader + // inputs like descriptor set bindings are communicated to this pipeline in a form that + // LLPC can understand. + result = pLayout->BuildLlpcPipelineMapping(RayTracingStageMask, + nullptr, + false, #if VKI_RAY_TRACING - isReplay, + isReplay, #endif - Util::VoidPtrInc(pCreateInfo->pTempBuffer, tempBufferOffset), - &pCreateInfo->pipelineInfo.resourceMapping, - &pCreateInfo->pipelineInfo.options.resourceLayoutScheme); + Util::VoidPtrInc(pCreateInfo->pTempBuffer, tempBufferOffset), + &pCreateInfo->pipelineInfo.resourceMapping, + &pCreateInfo->pipelineInfo.options.resourceLayoutScheme); - tempBufferOffset += genericMappingBufferSize; + tempBufferOffset += genericMappingBufferSize; + } } - } - if (result == VK_SUCCESS) - { - pCreateInfo->pipelineInfo.shaderCount = pShaderInfo->stageCount; - pCreateInfo->pipelineInfo.pShaderGroups = pIn->pGroups; - pCreateInfo->pipelineInfo.shaderGroupCount = pIn->groupCount; - pCreateInfo->pipelineInfo.pShaders = static_cast( - Util::VoidPtrInc(pCreateInfo->pTempBuffer, tempBufferOffset)); - tempBufferOffset += pipelineInfoBufferSize; - - uint32_t nonRayGenCount = 0; - bool shaderCanInline = (settings.rtCompileMode != RtCompileMode::RtCompileModeIndirect); - - for (uint32_t i = 0; i < pShaderInfo->stageCount; ++i) + if (result == VK_SUCCESS) { - pCreateInfo->pipelineInfo.pShaders[i].pModuleData = - ShaderModule::GetFirstValidShaderData(pShaderInfo->pStages[i].pModuleHandle); - pCreateInfo->pipelineInfo.pShaders[i].pSpecializationInfo = - pShaderInfo->pStages[i].pSpecializationInfo; - pCreateInfo->pipelineInfo.pShaders[i].pEntryTarget = pShaderInfo->pStages[i].pEntryPoint; - pCreateInfo->pipelineInfo.pShaders[i].entryStage = pShaderInfo->pStages[i].stage; + pCreateInfo->pipelineInfo.shaderCount = pShaderInfo->stageCount; + pCreateInfo->pipelineInfo.pShaderGroups = pIn->pGroups; + pCreateInfo->pipelineInfo.shaderGroupCount = pIn->groupCount; + pCreateInfo->pipelineInfo.pShaders = static_cast( + Util::VoidPtrInc(pCreateInfo->pTempBuffer, tempBufferOffset)); + tempBufferOffset += pipelineInfoBufferSize; - if (pShaderInfo->pStages[i].stage != ShaderStage::ShaderStageRayTracingRayGen) - { - ++nonRayGenCount; - } + uint32_t nonRayGenCount = 0; + bool shaderCanInline = (settings.rtCompileMode != RtCompileMode::RtCompileModeIndirect); - if (shaderCanInline && (settings.shaderInlineFlags != ShaderInlineFlags::InlineAll)) + for (uint32_t i = 0; i < pShaderInfo->stageCount; ++i) { - switch (pShaderInfo->pStages[i].stage) + pCreateInfo->pipelineInfo.pShaders[i].pModuleData = + ShaderModule::GetFirstValidShaderData(pShaderInfo->pStages[i].pModuleHandle); + pCreateInfo->pipelineInfo.pShaders[i].pSpecializationInfo = + pShaderInfo->pStages[i].pSpecializationInfo; + pCreateInfo->pipelineInfo.pShaders[i].pEntryTarget = pShaderInfo->pStages[i].pEntryPoint; + pCreateInfo->pipelineInfo.pShaders[i].entryStage = pShaderInfo->pStages[i].stage; + + if (pShaderInfo->pStages[i].stage != ShaderStage::ShaderStageRayTracingRayGen) { - case ShaderStage::ShaderStageRayTracingRayGen: - // Raygen can always be inlined. - break; - case ShaderStage::ShaderStageRayTracingMiss: - shaderCanInline = - Util::TestAnyFlagSet(settings.shaderInlineFlags, ShaderInlineFlags::InlineMissShader); - break; - case ShaderStage::ShaderStageRayTracingClosestHit: - shaderCanInline = - Util::TestAnyFlagSet(settings.shaderInlineFlags, ShaderInlineFlags::InlineClosestHitShader); - break; - case ShaderStage::ShaderStageRayTracingAnyHit: - shaderCanInline = - Util::TestAnyFlagSet(settings.shaderInlineFlags, ShaderInlineFlags::InlineAnyHitShader); - break; - case ShaderStage::ShaderStageRayTracingIntersect: - shaderCanInline = - Util::TestAnyFlagSet(settings.shaderInlineFlags, ShaderInlineFlags::InlineIntersectionShader); - break; - case ShaderStage::ShaderStageRayTracingCallable: - shaderCanInline = - Util::TestAnyFlagSet(settings.shaderInlineFlags, ShaderInlineFlags::InlineCallableShader); - break; - default: - VK_NEVER_CALLED(); - break; + ++nonRayGenCount; } - } - } - const uint32_t raygenCount = pShaderInfo->stageCount - nonRayGenCount; - - pCreateInfo->allowShaderInlining = (shaderCanInline && - (nonRayGenCount <= settings.maxUnifiedNonRayGenShaders) && - (raygenCount <= settings.maxUnifiedRayGenShaders)); - // if it is a pipeline library, or a main pipeline which would link to a library, - // force indirect path by set pCreateInfo->allowShaderInlining = false - if (isLibrary || hasLibraries) - { - pCreateInfo->allowShaderInlining = false; - } - - { - pCreateInfo->compilerType = CheckCompilerType(&pCreateInfo->pipelineInfo); - } + if (shaderCanInline && (settings.shaderInlineFlags != ShaderInlineFlags::InlineAll)) + { + switch (pShaderInfo->pStages[i].stage) + { + case ShaderStage::ShaderStageRayTracingRayGen: + // Raygen can always be inlined. + break; + case ShaderStage::ShaderStageRayTracingMiss: + shaderCanInline = + Util::TestAnyFlagSet(settings.shaderInlineFlags, ShaderInlineFlags::InlineMissShader); + break; + case ShaderStage::ShaderStageRayTracingClosestHit: + shaderCanInline = + Util::TestAnyFlagSet(settings.shaderInlineFlags, ShaderInlineFlags::InlineClosestHitShader); + break; + case ShaderStage::ShaderStageRayTracingAnyHit: + shaderCanInline = + Util::TestAnyFlagSet(settings.shaderInlineFlags, ShaderInlineFlags::InlineAnyHitShader); + break; + case ShaderStage::ShaderStageRayTracingIntersect: + shaderCanInline = + Util::TestAnyFlagSet(settings.shaderInlineFlags, ShaderInlineFlags::InlineIntersectionShader); + break; + case ShaderStage::ShaderStageRayTracingCallable: + shaderCanInline = + Util::TestAnyFlagSet(settings.shaderInlineFlags, ShaderInlineFlags::InlineCallableShader); + break; + default: + VK_NEVER_CALLED(); + break; + } + } + } - for (uint32_t i = 0; i < pShaderInfo->stageCount; ++i) - { - ApplyDefaultShaderOptions(pShaderInfo->pStages[i].stage, - pShaderInfo->pStages[i].flags, - &pCreateInfo->pipelineInfo.pShaders[i].options); - } + const uint32_t raygenCount = pShaderInfo->stageCount - nonRayGenCount; - if (pCreateInfo->compilerType == PipelineCompilerTypeLlpc) - { - // TODO: move it to llpc - if (pCreateInfo->allowShaderInlining) + pCreateInfo->allowShaderInlining = (shaderCanInline && + (nonRayGenCount <= settings.maxUnifiedNonRayGenShaders) && + (raygenCount <= settings.maxUnifiedRayGenShaders)); + // if it is a pipeline library, or a main pipeline which would link to a library, + // force indirect path by set pCreateInfo->allowShaderInlining = false + if (isLibrary || hasLibraries) { - pCreateInfo->pipelineInfo.indirectStageMask = 0; + pCreateInfo->allowShaderInlining = false; } - uint32_t vgprLimit = - m_compilerSolutionLlpc.GetRayTracingVgprLimit(pCreateInfo->pipelineInfo.indirectStageMask != 0); + { + pCreateInfo->compilerType = CheckCompilerType(&pCreateInfo->pipelineInfo); + } for (uint32_t i = 0; i < pShaderInfo->stageCount; ++i) { - pCreateInfo->pipelineInfo.pShaders[i].options.vgprLimit = vgprLimit; + ApplyDefaultShaderOptions(pShaderInfo->pStages[i].stage, + pShaderInfo->pStages[i].flags, + &pCreateInfo->pipelineInfo.pShaders[i].options); } - } - { - for (uint32_t i = 0; i < pShaderInfo->stageCount; ++i) + if (pCreateInfo->compilerType == PipelineCompilerTypeLlpc) { + // TODO: move it to llpc + if (pCreateInfo->allowShaderInlining) + { + pCreateInfo->pipelineInfo.indirectStageMask = 0; + } - ApplyProfileOptions(pDevice, - i, - &pCreateInfo->pipelineInfo.options, - &pCreateInfo->pipelineInfo.pShaders[i], - pPipelineProfileKey, - nullptr); + uint32_t vgprLimit = + m_compilerSolutionLlpc.GetRayTracingVgprLimit(pCreateInfo->pipelineInfo.indirectStageMask != 0); + + for (uint32_t i = 0; i < pShaderInfo->stageCount; ++i) + { + pCreateInfo->pipelineInfo.pShaders[i].options.vgprLimit = vgprLimit; + } } - } + { + for (uint32_t i = 0; i < pShaderInfo->stageCount; ++i) + { - SetRayTracingState(pDevice, &pCreateInfo->pipelineInfo.rtState, pCreateInfo->flags); + ApplyProfileOptions(pDevice, + i, + &pCreateInfo->pipelineInfo.options, + &pCreateInfo->pipelineInfo.pShaders[i], + pPipelineProfileKey, + nullptr); + } + + } + SetRayTracingState(pDevice, &pCreateInfo->pipelineInfo.rtState, pCreateInfo->flags); + } } return result; @@ -3962,7 +3982,6 @@ void PipelineCompiler::SetRayTracingState( RayTracingPipeline::ConvertStaticPipelineFlags(pDevice, &pRtState->staticPipelineFlags, - &pRtState->triCompressMode, &pRtState->counterMode, pRtState->pipelineFlags ); @@ -4372,12 +4391,12 @@ Util::Result PipelineCompiler::RegisterAndLoadReinjectionBinary( #endif // ===================================================================================================================== -// Filter VkPipelineCreateFlags to only values used for pipeline caching -static PipelineCreateFlags GetCacheIdControlFlags( - PipelineCreateFlags in) +// Filter VkPipelineCreateFlags2KHR to only values used for pipeline caching +static VkPipelineCreateFlags2KHR GetCacheIdControlFlags( + VkPipelineCreateFlags2KHR in) { // The following flags should NOT affect cache computation - static constexpr PipelineCreateFlags CacheIdIgnoreFlags = { 0 + static constexpr VkPipelineCreateFlags2KHR CacheIdIgnoreFlags = { 0 | VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR | VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR | VK_PIPELINE_CREATE_DERIVATIVE_BIT @@ -4394,7 +4413,7 @@ static PipelineCreateFlags GetCacheIdControlFlags( // properties as well as options to avoid user error when changing performance tuning, compiler, or any other settings. static void GetCommonPipelineCacheId( uint32_t deviceIdx, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const PipelineOptimizerKey* pPipelineProfileKey, PipelineCompilerType compilerType, uint64_t pipelineHash, @@ -4486,6 +4505,7 @@ void PipelineCompiler::GetColorExportShaderCacheId( // Update hash based on fragment output state hash.Update(pCreateInfo->pipelineInfo.iaState.enableMultiView); + hash.Update(pCreateInfo->pBinaryMetadata->dualSrcBlendingUsed); const auto& cbState = pCreateInfo->pipelineInfo.cbState; hash.Update(cbState.alphaToCoverageEnable); @@ -4772,7 +4792,8 @@ size_t PipelineCompiler::GetUberFetchShaderInternalDataSize( } VK_ASSERT(maxLocation < Vkgc::MaxVertexAttribs); - memSize = static_cast(sizeof(Vkgc::UberFetchShaderAttribInfo) * (maxLocation + 1)); + memSize = + static_cast(sizeof(Vkgc::UberFetchShaderAttribInfo) * (maxLocation + 1)) + sizeof(uint64_t); } return memSize; @@ -5070,4 +5091,22 @@ uint32_t PipelineCompiler::BuildUberFetchShaderInternalData( dynamicStride, pUberFetchShaderInternalData); } + +// ===================================================================================================================== +// Template instantiation needed for references in other files. Linux complains if we don't do this. + +template +PipelineCompilerType PipelineCompiler::CheckCompilerType( + const Vkgc::ComputePipelineBuildInfo* pPipelineBuildInfo); + +template +PipelineCompilerType PipelineCompiler::CheckCompilerType( + const Vkgc::GraphicsPipelineBuildInfo* pPipelineBuildInfo); + +#if VKI_RAY_TRACING +template +PipelineCompilerType PipelineCompiler::CheckCompilerType( + const Vkgc::RayTracingPipelineBuildInfo* pPipelineBuildInfo); +#endif + } diff --git a/icd/api/raytrace/ray_tracing_device.cpp b/icd/api/raytrace/ray_tracing_device.cpp index ef0d90ec..d7bba065 100644 --- a/icd/api/raytrace/ray_tracing_device.cpp +++ b/icd/api/raytrace/ray_tracing_device.cpp @@ -182,8 +182,25 @@ void RayTracingDevice::CreateGpuRtDeviceSettings( pDeviceSettings->enableMortonCode30 = settings.rtEnableMortonCode30; pDeviceSettings->enableVariableBitsMortonCodes = settings.enableVariableBitsMortonCodes; pDeviceSettings->enablePrefixScanDLB = settings.rtEnablePrefixScanDLB; - pDeviceSettings->triangleCompressionAutoMode = - ConvertGpuRtTriCompressionAutoMode(settings.rtTriangleCompressionAutoMode); + + switch (settings.rtTriangleCompressionMode) + { + case NoTriangleCompression: + pDeviceSettings->triangleCompressionAutoMode = GpuRt::TriangleCompressionAutoMode::Disabled; + break; + case PairTriangleCompression: + pDeviceSettings->triangleCompressionAutoMode = GpuRt::TriangleCompressionAutoMode::AlwaysEnabled; + break; + case AutoTriangleCompression: + pDeviceSettings->triangleCompressionAutoMode = + ConvertGpuRtTriCompressionAutoMode(settings.rtTriangleCompressionAutoMode); + break; + default: + VK_NEVER_CALLED(); + pDeviceSettings->triangleCompressionAutoMode = GpuRt::TriangleCompressionAutoMode::Disabled; + break; + } + pDeviceSettings->bvhBuildModeDefault = ConvertGpuRtBvhBuildMode(settings.rtBvhBuildModeDefault); pDeviceSettings->bvhBuildModeFastTrace = ConvertGpuRtBvhBuildMode(settings.rtBvhBuildModeFastTrace); pDeviceSettings->bvhBuildModeFastBuild = ConvertGpuRtBvhBuildMode(settings.rtBvhBuildModeFastBuild); @@ -675,7 +692,7 @@ Pal::Result RayTracingDevice::ClientCreateInternalComputePipeline( const GpuRt::DeviceInitInfo& initInfo, ///< [in] Information about the host device const GpuRt::PipelineBuildInfo& buildInfo, ///< [in] Information about the pipeline to be built const GpuRt::CompileTimeConstants& compileConstants, ///< [in] Compile time constant buffer description - Pal::IPipeline** ppResultPipeline, ///< [out] Result PAL pipeline object pointer + ClientPipelineHandle* pResultPipeline, ///< [out] Result PAL pipeline object pointer void** ppResultMemory) ///< [out] (Optional) Result PAL pipeline memory, ///< if different from obj { @@ -818,7 +835,7 @@ Pal::Result RayTracingDevice::ClientCreateInternalComputePipeline( &specializationInfo, &pDevice->GetInternalRayTracingPipeline()); - *ppResultPipeline = pDevice->GetInternalRayTracingPipeline().pPipeline[0]; + *pResultPipeline = pDevice->GetInternalRayTracingPipeline().pPipeline[0]; return result == VK_SUCCESS ? Pal::Result::Success : Pal::Result::ErrorUnknown; } @@ -828,10 +845,11 @@ Pal::Result RayTracingDevice::ClientCreateInternalComputePipeline( // Destroy one of gpurt's internal pipelines. void RayTracingDevice::ClientDestroyInternalComputePipeline( const GpuRt::DeviceInitInfo& initInfo, - Pal::IPipeline* pPipeline, + ClientPipelineHandle pipeline, void* pMemory) { - vk::Device* pDevice = reinterpret_cast(initInfo.pClientUserData); + vk::Device* pDevice = static_cast(initInfo.pClientUserData); + Pal::IPipeline* pPipeline = static_cast(pipeline); if (pMemory == nullptr) { @@ -845,11 +863,12 @@ void RayTracingDevice::ClientDestroyInternalComputePipeline( // ===================================================================================================================== void RayTracingDevice::ClientInsertRGPMarker( - Pal::ICmdBuffer* pCmdBuffer, - const char* pMarker, - bool isPush) + ClientCmdBufferHandle cmdBuffer, + const char* pMarker, + bool isPush) { - vk::CmdBuffer* pCmdbuf = reinterpret_cast(pCmdBuffer->GetClientData()); + Pal::ICmdBuffer* pPalCmdbuf = static_cast(cmdBuffer); + vk::CmdBuffer* pCmdbuf = static_cast(pPalCmdbuf->GetClientData()); if ((pCmdbuf != nullptr) && (pCmdbuf->GetSqttState() != nullptr)) { @@ -864,7 +883,7 @@ void RayTracingDevice::ClientInsertRGPMarker( // // We keep this memory around for later and write it out to files. Pal::Result RayTracingDevice::ClientAccelStructBuildDumpEvent( - Pal::ICmdBuffer* pPalCmdbuf, + ClientCmdBufferHandle cmdbuf, const GpuRt::AccelStructInfo& info, const GpuRt::AccelStructBuildInfo& buildInfo, Pal::gpusize* pDumpGpuVirtAddr) @@ -879,7 +898,7 @@ Pal::Result RayTracingDevice::ClientAccelStructBuildDumpEvent( // // We keep this memory around for later and write it out to files. Pal::Result RayTracingDevice::ClientAccelStatsBuildDumpEvent( - Pal::ICmdBuffer* pPalCmdbuf, + ClientCmdBufferHandle cmdbuf, GpuRt::AccelStructInfo* pInfo) { Pal::Result result = Pal::Result::ErrorOutOfGpuMemory; @@ -893,10 +912,10 @@ Pal::Result RayTracingDevice::ClientAccelStatsBuildDumpEvent( Pal::Result RayTracingDevice::ClientAcquireCmdContext( const GpuRt::DeviceInitInfo& initInfo, // GpuRt device info ClientCmdContextHandle* pContext, // (out) Opaque command context handle - Pal::ICmdBuffer** ppCmdBuffer) // (out) Command buffer for GPURT to fill + ClientCmdBufferHandle* pCmdBuffer) // (out) Command buffer for GPURT to fill { VK_ASSERT(initInfo.pClientUserData != nullptr); - VK_ASSERT(ppCmdBuffer != nullptr); + VK_ASSERT(pCmdBuffer != nullptr); VK_ASSERT(pContext != nullptr); Pal::Result result = Pal::Result::Success; @@ -930,8 +949,8 @@ Pal::Result RayTracingDevice::ClientAcquireCmdContext( if (result == Pal::Result::Success) { - *ppCmdBuffer = pCmdContext->pCmdBuffer; - *pContext = reinterpret_cast(pCmdContext); + *pCmdBuffer = pCmdContext->pCmdBuffer; + *pContext = reinterpret_cast(pCmdContext); } return result; @@ -1076,8 +1095,8 @@ void RayTracingDevice::ClientFreeGpuMem( const GpuRt::DeviceInitInfo& initInfo, ClientGpuMemHandle gpuMem) { - vk::Device* pDevice = reinterpret_cast(initInfo.pClientUserData); - vk::InternalMemory* pInternalMemory = reinterpret_cast(gpuMem); + vk::Device* pDevice = static_cast(initInfo.pClientUserData); + vk::InternalMemory* pInternalMemory = static_cast(gpuMem); VK_ASSERT(pInternalMemory != nullptr); diff --git a/icd/api/raytrace/ray_tracing_device.h b/icd/api/raytrace/ray_tracing_device.h index 4e78efdd..16a4ecc5 100644 --- a/icd/api/raytrace/ray_tracing_device.h +++ b/icd/api/raytrace/ray_tracing_device.h @@ -141,33 +141,33 @@ class RayTracingDevice const GpuRt::DeviceInitInfo& initInfo, const GpuRt::PipelineBuildInfo& buildInfo, const GpuRt::CompileTimeConstants& compileConstants, - Pal::IPipeline** ppResultPipeline, + ClientPipelineHandle* pResultPipeline, void** ppResultMemory); static void ClientDestroyInternalComputePipeline( const GpuRt::DeviceInitInfo& initInfo, - Pal::IPipeline* pPipeline, + ClientPipelineHandle pipeline, void* pMemory); static void ClientInsertRGPMarker( - Pal::ICmdBuffer* pCmdBuffer, - const char* pMarker, - bool isPush); + ClientCmdBufferHandle cmdBuffer, + const char* pMarker, + bool isPush); static Pal::Result ClientAccelStructBuildDumpEvent( - Pal::ICmdBuffer* pPalCmdbuf, + ClientCmdBufferHandle cmdbuf, const GpuRt::AccelStructInfo& info, const GpuRt::AccelStructBuildInfo& buildInfo, Pal::gpusize* pDumpGpuVirtAddr); static Pal::Result ClientAccelStatsBuildDumpEvent( - Pal::ICmdBuffer* pPalCmdbuf, + ClientCmdBufferHandle cmdbuf, GpuRt::AccelStructInfo* pInfo); static Pal::Result ClientAcquireCmdContext( const GpuRt::DeviceInitInfo& initInfo, ClientCmdContextHandle* pContext, - Pal::ICmdBuffer** ppCmdBuffer); + ClientCmdBufferHandle* pCmdBuffer); static Pal::Result ClientFlushCmdContext( ClientCmdContextHandle context); diff --git a/icd/api/raytrace/ray_tracing_util.h b/icd/api/raytrace/ray_tracing_util.h index 5c942dd3..e73e58fe 100644 --- a/icd/api/raytrace/ray_tracing_util.h +++ b/icd/api/raytrace/ray_tracing_util.h @@ -36,34 +36,6 @@ namespace vk #define MAKE_GPURT_VERSION(MAJOR, MINOR) ((MAJOR << 16) | MINOR) -// ===================================================================================================================== -// Converts a Vulkan triangle compression mode setting to the GpuRT equivalent of TriangleCompressionMode -inline GpuRt::TriangleCompressionMode ConvertGpuRtTriCompressMode( - TriangleCompressionMode vkMode) - { - GpuRt::TriangleCompressionMode gpuRtMode = GpuRt::TriangleCompressionMode::None; - - switch (vkMode) - { - case NoTriangleCompression: - gpuRtMode = GpuRt::TriangleCompressionMode::None; - break; - case PairTriangleCompression: - gpuRtMode = GpuRt::TriangleCompressionMode::Pair; - break; - case AutoTriangleCompression: - // Driver will do the auto selection, no need to translate to GpuRt::TriangleCompressionMode - gpuRtMode = GpuRt::TriangleCompressionMode::None; - break; - default: - VK_NEVER_CALLED(); - gpuRtMode = GpuRt::TriangleCompressionMode::None; - break; - } - - return gpuRtMode; -} - // ===================================================================================================================== // Converts a Vulkan triangle compression mode setting to the GpuRT equivalent of TriangleCompressionAutoMode inline GpuRt::TriangleCompressionAutoMode ConvertGpuRtTriCompressionAutoMode( diff --git a/icd/api/raytrace/vk_ray_tracing_pipeline.cpp b/icd/api/raytrace/vk_ray_tracing_pipeline.cpp index 57b3c448..39256984 100644 --- a/icd/api/raytrace/vk_ray_tracing_pipeline.cpp +++ b/icd/api/raytrace/vk_ray_tracing_pipeline.cpp @@ -174,7 +174,7 @@ static void GenerateHashFromRayTracingPipelineInterfaceCreateInfo( // - pCreateInfo->layout void RayTracingPipeline::BuildApiHash( const VkRayTracingPipelineCreateInfoKHR* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, Util::MetroHash::Hash* pElfHash, uint64_t* pApiHash) { @@ -411,7 +411,7 @@ VkResult RayTracingPipeline::Destroy( VkResult RayTracingPipeline::CreateImpl( PipelineCache* pPipelineCache, const VkRayTracingPipelineCreateInfoKHR* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const VkAllocationCallbacks* pAllocator, DeferredWorkload* pDeferredWorkload) { @@ -1031,7 +1031,7 @@ VkResult RayTracingPipeline::CreateImpl( { Pal::ShaderLibStats shaderStats = {}; pShaderLibrary->GetShaderFunctionStats( - libFuncList.Data()[i].symbolName.Data(), + libFuncList.Data()[i].symbolName, &shaderStats); pShaderStackSize[libIdx] += shaderStats.cpsStackSizes.frontendSize; // NOTE: Backend stack size is determined across all shaders (functions), no @@ -1044,7 +1044,7 @@ VkResult RayTracingPipeline::CreateImpl( { Pal::ShaderLibStats shaderStats = {}; pShaderLibrary->GetShaderFunctionStats( - pIndirectFuncInfo[libIdx].symbolName.Data(), + pIndirectFuncInfo[libIdx].symbolName, &shaderStats); pShaderStackSize[libIdx] = shaderStats.stackFrameSizeInBytes; } @@ -1500,7 +1500,7 @@ static int32_t DeferredCreateRayTracingPipelineCallback( { VkResult localResult = VK_SUCCESS; const VkRayTracingPipelineCreateInfoKHR* pCreateInfo = &pState->pInfos[index]; - PipelineCreateFlags flags = + VkPipelineCreateFlags2KHR flags = Device::GetPipelineCreateFlags(pCreateInfo); if (pState->skipRemaining == VK_FALSE) @@ -1682,7 +1682,7 @@ VkResult RayTracingPipeline::Create( { VkResult localResult = VK_SUCCESS; const VkRayTracingPipelineCreateInfoKHR* pCreateInfo = &pCreateInfos[i]; - PipelineCreateFlags flags = + VkPipelineCreateFlags2KHR flags = Device::GetPipelineCreateFlags(pCreateInfo); pObjMem = pDevice->AllocApiObject( @@ -1834,7 +1834,9 @@ bool RayTracingPipeline::MapShaderIdToShaderHandle( if (pShaderProp[i].shaderId == *pShaderId) { auto pIndirectFunc = &pIndirectFuncList[pShaderNameMap[i]]; - VK_ASSERT(pIndirectFunc->symbolName.Data() == &pShaderProp[i].name[0]); + VK_ASSERT(strncmp(pIndirectFunc->symbolName.Data(), + &pShaderProp[i].name[0], + pIndirectFunc->symbolName.Length()) == 0); uint64_t gpuVirtAddr = pIndirectFunc->gpuVirtAddr; if (pShaderProp[i].onlyGpuVaLo) { @@ -1988,7 +1990,6 @@ void RayTracingPipeline::UpdatePipelineImplCreateInfo( void RayTracingPipeline::ConvertStaticPipelineFlags( const Device* pDevice, uint32_t* pStaticFlags, - uint32_t* pTriangleCompressMode, uint32_t* pCounterMode, uint32_t pipelineFlags ) @@ -2004,8 +2005,6 @@ void RayTracingPipeline::ConvertStaticPipelineFlags( *pStaticFlags = staticFlags; - *pTriangleCompressMode = static_cast(ConvertGpuRtTriCompressMode(settings.rtTriangleCompressionMode)); - *pCounterMode = static_cast(counterMode); } diff --git a/icd/api/raytrace/vk_ray_tracing_pipeline.h b/icd/api/raytrace/vk_ray_tracing_pipeline.h index e6969bc7..6d137a42 100644 --- a/icd/api/raytrace/vk_ray_tracing_pipeline.h +++ b/icd/api/raytrace/vk_ray_tracing_pipeline.h @@ -150,7 +150,7 @@ class RayTracingPipeline final : public Pipeline, public NonDispatchableu32All = 0; - BufferUsageFlagBits usage = Device::GetBufferUsageFlagBits(pCreateInfo); + VkBufferUsageFlagBits2KHR usage = Device::GetBufferUsageFlagBits(pCreateInfo); pBufferFlags->usageUniformBuffer = (usage & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) ? 1 : 0; #if VKI_RAY_TRACING diff --git a/icd/api/vk_cmdbuffer.cpp b/icd/api/vk_cmdbuffer.cpp index faad2d50..afcb3b55 100644 --- a/icd/api/vk_cmdbuffer.cpp +++ b/icd/api/vk_cmdbuffer.cpp @@ -1559,19 +1559,9 @@ VkResult CmdBuffer::Begin( if (supportedVrsRates & (1 << static_cast(Pal::VrsShadingRate::_1x1))) { Pal::VrsCenterState centerState = {}; - m_allGpuState.vrsRate = {}; + m_allGpuState.vrsRate = {}; - m_allGpuState.vrsRate.flags.exposeVrsPixelsMask = 1; - - // Don't use coarse shading. - m_allGpuState.vrsRate.shadingRate = Pal::VrsShadingRate::_1x1; - - // Set combiner state for for PsIterator and ProvokingVertex - m_allGpuState.vrsRate.combinerState[static_cast(Pal::VrsCombinerStage::PsIterSamples)] = - Pal::VrsCombiner::Override; - - m_allGpuState.vrsRate.combinerState[static_cast(Pal::VrsCombinerStage::ProvokingVertex)] = - Pal::VrsCombiner::Override; + Device::SetDefaultVrsRateParams(&m_allGpuState.vrsRate); utils::IterateMask deviceGroupVrs(GetDeviceMask()); @@ -1583,8 +1573,7 @@ VkResult CmdBuffer::Begin( // A null source image implies 1x1 shading rate for the image combiner stage. PalCmdBuffer(deviceIdx)->CmdBindSampleRateImage(nullptr); - } - while (deviceGroupVrs.IterateNext()); + } while (deviceGroupVrs.IterateNext()); } } @@ -3173,17 +3162,20 @@ void CmdBuffer::DrawMeshTasks( uint32_t y, uint32_t z) { - DbgBarrierPreCmd(DbgBarrierDrawMeshTasks); + if ((x * y * z) > 0) + { + DbgBarrierPreCmd(DbgBarrierDrawMeshTasks); - ValidateGraphicsStates(); + ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0); #endif - PalCmdDrawMeshTasks(x, y, z); + PalCmdDrawMeshTasks(x, y, z); - DbgBarrierPostCmd(DbgBarrierDrawMeshTasks); + DbgBarrierPostCmd(DbgBarrierDrawMeshTasks); + } } // ===================================================================================================================== @@ -3369,48 +3361,6 @@ void CmdBuffer::ClearColorImage( PalCmdSuspendPredication(false); } -// ===================================================================================================================== -bool CmdBuffer::PreBltBindMsaaState( - const Image& image) -{ - const Pal::IMsaaState* const* pBltMsaa = nullptr; - - if (GetPalQueueType() == Pal::QueueTypeUniversal) - { - const Pal::ImageCreateInfo& imgInfo = image.PalImage(DefaultDeviceIndex)->GetImageCreateInfo(); - - if (imgInfo.samples > 1) - { - pBltMsaa = m_pDevice->GetBltMsaaState(imgInfo.samples); - } - - PalCmdBindMsaaStates(pBltMsaa); - } - - return (pBltMsaa != nullptr) ? true : false; -} - -// ===================================================================================================================== -void CmdBuffer::PostBltRestoreMsaaState( - bool bltMsaaState) -{ - if (GetPalQueueType() == Pal::QueueTypeUniversal) - { - if (bltMsaaState && - (m_allGpuState.pGraphicsPipeline != nullptr)) - { - if (m_allGpuState.pGraphicsPipeline->GetPipelineFlags().bindMsaaObject) - { - PalCmdBindMsaaStates(m_allGpuState.pGraphicsPipeline->GetMsaaStates()); - } - else - { - m_allGpuState.dirtyGraphics.msaa = 1; - } - } - } -} - // ===================================================================================================================== // Performs a depth-stencil clear of an image (vkCmdClearDepthStencilImage) void CmdBuffer::ClearDepthStencilImage( @@ -3947,8 +3897,6 @@ void CmdBuffer::PalCmdClearColorImage( { DbgBarrierPreCmd(DbgBarrierClearColor); - bool bltMsaaState = PreBltBindMsaaState(image); - utils::IterateMask deviceGroup(m_curDeviceMask); do @@ -3968,8 +3916,6 @@ void CmdBuffer::PalCmdClearColorImage( } while (deviceGroup.IterateNext()); - PostBltRestoreMsaaState(bltMsaaState); - DbgBarrierPostCmd(DbgBarrierClearColor); } @@ -3988,8 +3934,6 @@ void CmdBuffer::PalCmdClearDepthStencil( { DbgBarrierPreCmd(DbgBarrierClearDepth); - bool bltMsaaState = PreBltBindMsaaState(image); - utils::IterateMask deviceGroup(m_curDeviceMask); do { @@ -4010,8 +3954,6 @@ void CmdBuffer::PalCmdClearDepthStencil( } while (deviceGroup.IterateNext()); - PostBltRestoreMsaaState(bltMsaaState); - DbgBarrierPostCmd(DbgBarrierClearDepth); } @@ -4060,8 +4002,6 @@ void CmdBuffer::PalCmdResolveImage( { DbgBarrierPreCmd(DbgBarrierResolve); - bool bltMsaaState = PreBltBindMsaaState(srcImage); - utils::IterateMask deviceGroup(deviceMask); do { @@ -4079,8 +4019,6 @@ void CmdBuffer::PalCmdResolveImage( } while (deviceGroup.IterateNext()); - PostBltRestoreMsaaState(bltMsaaState); - DbgBarrierPostCmd(DbgBarrierResolve); } @@ -4550,10 +4488,6 @@ void CmdBuffer::LoadOpClearDepthStencil( } } } - else - { - depthLayout = stencilLayout; - } if (pDepthStencilImage != nullptr) { @@ -8640,11 +8574,6 @@ void CmdBuffer::BindTargets( params.depthTarget.pDepthStencilView = pStencilImageView->PalDepthStencilView(deviceIdx); params.depthTarget.stencilLayout = stencilLayout; } - else - { - params.depthTarget.pDepthStencilView = nullptr; - params.depthTarget.stencilLayout = NullLayout; - } const VkRenderingAttachmentInfoKHR* pDepthAttachmentInfo = pRenderingInfo->pDepthAttachment; @@ -8667,11 +8596,6 @@ void CmdBuffer::BindTargets( params.depthTarget.pDepthStencilView = pDepthImageView->PalDepthStencilView(deviceIdx); params.depthTarget.depthLayout = depthLayout; } - else - { - // Set the depthLayout for stencil only formats to avoid incorrect PAL asserts. - params.depthTarget.depthLayout = params.depthTarget.stencilLayout; - } PalCmdBuffer(deviceIdx)->CmdBindTargets(params); @@ -11253,6 +11177,17 @@ void CmdBuffer::ValidateGraphicsStates() const DynamicColorBlend* pColorBlend = nullptr; const DynamicMsaa* pMsaa = nullptr; + if (m_allGpuState.dirtyGraphics.msaa || m_allGpuState.dirtyGraphics.samplePattern) + { + uint32_t enable1xMsaaSampleLocations = + (m_allGpuState.sampleLocationsEnable && (m_allGpuState.msaaCreateInfo.coverageSamples == 1)) ? 1 : 0; + if (m_allGpuState.msaaCreateInfo.flags.enable1xMsaaSampleLocations != enable1xMsaaSampleLocations) + { + m_allGpuState.msaaCreateInfo.flags.enable1xMsaaSampleLocations = enable1xMsaaSampleLocations; + m_allGpuState.dirtyGraphics.msaa = 1; + } + } + utils::IterateMask deviceGroup(m_cbBeginDeviceMask); do { @@ -11417,7 +11352,7 @@ void CmdBuffer::ValidateGraphicsStates() Pal::VrsRateParams vrsRate = m_allGpuState.vrsRate; if (force1x1) { - Force1x1ShaderRate(&vrsRate); + Device::SetDefaultVrsRateParams(&vrsRate); } if (m_allGpuState.minSampleShading > 0.0) @@ -11915,8 +11850,6 @@ void CmdBuffer::SetRasterizationSamples( m_allGpuState.msaaCreateInfo.shaderExportMaskSamples = rasterizationSampleCount; m_allGpuState.msaaCreateInfo.alphaToCoverageSamples = rasterizationSampleCount; m_allGpuState.msaaCreateInfo.occlusionQuerySamples = rasterizationSampleCount; - m_allGpuState.msaaCreateInfo.flags.enable1xMsaaSampleLocations = (rasterizationSampleCount == 1); - m_allGpuState.dirtyGraphics.msaa = 1; } diff --git a/icd/api/vk_cmdbuffer_transfer.cpp b/icd/api/vk_cmdbuffer_transfer.cpp index ecb18703..3caecc21 100644 --- a/icd/api/vk_cmdbuffer_transfer.cpp +++ b/icd/api/vk_cmdbuffer_transfer.cpp @@ -686,6 +686,12 @@ void CmdBuffer::CopyQueryPoolResults( ) { const PalQueryPool* pPool = pBasePool->AsPalQueryPool(); + Pal::QueryResultFlags palFlags = VkToPalQueryResultFlags(flags); + if (pBasePool->GetQueryType() == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT) + { + palFlags = static_cast( + static_cast(palFlags) | static_cast(Pal::QueryResultOnlyPrimNeeded)); + } utils::IterateMask deviceGroup(m_curDeviceMask); do @@ -694,7 +700,7 @@ void CmdBuffer::CopyQueryPoolResults( PalCmdBuffer(deviceIdx)->CmdResolveQuery( *pPool->PalPool(deviceIdx), - VkToPalQueryResultFlags(flags), + palFlags, pPool->PalQueryType(), firstQuery, queryCount, diff --git a/icd/api/vk_compute_pipeline.cpp b/icd/api/vk_compute_pipeline.cpp index 053de500..3271d855 100644 --- a/icd/api/vk_compute_pipeline.cpp +++ b/icd/api/vk_compute_pipeline.cpp @@ -50,7 +50,7 @@ namespace vk // - pCreateInfo->layout void ComputePipeline::BuildApiHash( const VkComputePipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const ComputePipelineShaderStageInfo& stageInfo, Util::MetroHash::Hash* pElfHash, uint64_t* pApiHash) @@ -189,7 +189,7 @@ VkResult ComputePipeline::Create( Device* pDevice, PipelineCache* pPipelineCache, const VkComputePipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const VkAllocationCallbacks* pAllocator, VkPipeline* pPipeline) { diff --git a/icd/api/vk_device.cpp b/icd/api/vk_device.cpp index f87291d3..1365a814 100644 --- a/icd/api/vk_device.cpp +++ b/icd/api/vk_device.cpp @@ -290,8 +290,6 @@ Device::Device( m_retrievedFaultData(false), m_pNullPipelineLayout(nullptr) { - memset(m_pBltMsaaState, 0, sizeof(m_pBltMsaaState)); - memset(m_pQueues, 0, sizeof(m_pQueues)); m_maxVrsShadingRate = {0, 0}; @@ -658,6 +656,17 @@ VkResult Device::Create( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIMITIVES_GENERATED_QUERY_FEATURES_EXT: + { + const VkPhysicalDevicePrimitivesGeneratedQueryFeaturesEXT* pFeaturesExt = + reinterpret_cast(pHeader); + if (pFeaturesExt->primitivesGeneratedQuery) + { + deviceFeatures.primitivesGeneratedQuery = true; + } + break; + } + default: break; } @@ -1185,11 +1194,6 @@ VkResult Device::Initialize( result = CreateInternalPipelines(); } - if (result == VK_SUCCESS) - { - result = CreateBltMsaaStates(); - } - if (result == VK_SUCCESS) { Pal::SamplePatternPalette palette = {}; @@ -1709,11 +1713,6 @@ VkResult Device::Destroy(const VkAllocationCallbacks* pAllocator) } } - for (uint32_t i = 0; i < BltMsaaStateCount; ++i) - { - m_renderStateCache.DestroyMsaaState(&m_pBltMsaaState[i][0], nullptr); - } - DestroyInternalPipelines(); DestroySharedPalCmdAllocator(); @@ -1817,7 +1816,7 @@ VkResult Device::CreateInternalComputePipeline( PipelineCompiler* pCompiler = GetCompiler(DefaultDeviceIndex); ShaderModuleHandle shaderModule = {}; Vkgc::BinaryData pipelineBinary = {}; - ShaderOptimizerKey shaderOptimzierKey = {}; + ShaderOptimizerKey shaderOptimizerKey = {}; PipelineOptimizerKey pipelineOptimizerKey = {}; PipelineMetadata binaryMetadata = {}; @@ -1828,9 +1827,9 @@ VkResult Device::CreateInternalComputePipeline( pCompiler->ApplyPipelineOptions(this, 0, &pipelineBuildInfo.pipelineInfo.options ); - shaderOptimzierKey.stage = ShaderStage::ShaderStageCompute; + shaderOptimizerKey.stage = ShaderStage::ShaderStageCompute; pipelineOptimizerKey.shaderCount = 1; - pipelineOptimizerKey.pShaders = &shaderOptimzierKey; + pipelineOptimizerKey.pShaders = &shaderOptimizerKey; pipelineBuildInfo.pPipelineProfileKey = &pipelineOptimizerKey; pipelineBuildInfo.pBinaryMetadata = &binaryMetadata; @@ -1851,8 +1850,11 @@ VkResult Device::CreateInternalComputePipeline( { // Build pipeline binary auto pShaderInfo = &pipelineBuildInfo.pipelineInfo.cs; - pipelineBuildInfo.compilerType = PipelineCompilerTypeLlpc; - pShaderInfo->pModuleData = shaderModule.pLlpcShaderModule; + + pShaderInfo->pModuleData = ShaderModule::GetFirstValidShaderData(&shaderModule); + pipelineBuildInfo.compilerType = pCompiler->CheckCompilerType(&pipelineBuildInfo.pipelineInfo); + pShaderInfo->pModuleData = ShaderModule::GetShaderData(pipelineBuildInfo.compilerType, &shaderModule); + pShaderInfo->pSpecializationInfo = pSpecializationInfo; pShaderInfo->pEntryTarget = Vkgc::IUtil::GetEntryPointNameFromSpirvBinary(&spvBin); pShaderInfo->entryStage = Vkgc::ShaderStageCompute; @@ -1878,7 +1880,7 @@ VkResult Device::CreateInternalComputePipeline( codeHash, Vkgc::ShaderStage::ShaderStageCompute, codeByteSize, - &shaderOptimzierKey); + &shaderOptimizerKey); PipelineShaderOptionsPtr options = {}; options.pPipelineOptions = &pipelineBuildInfo.pipelineInfo.options; @@ -1899,12 +1901,30 @@ VkResult Device::CreateInternalComputePipeline( if (pCompiler->GetBinaryCache() != nullptr) { - pCompiler->GetComputePipelineCacheId( + // Set up the ELF hash, which is used for indexing the pipeline cache + Util::MetroHash::Hash elfHash = {}; + Util::MetroHash128 elfHasher = {}; + ShaderStageInfo stageInfo = {}; + + stageInfo.stage = ShaderStage::ShaderStageCompute; + stageInfo.codeHash = codeHash; + stageInfo.codeSize = codeByteSize; + stageInfo.pEntryPoint = pShaderInfo->pEntryTarget; + stageInfo.pSpecializationInfo = pSpecializationInfo; + stageInfo.waveSize = pShaderInfo->options.waveSize; + + ComputePipeline::GenerateHashFromShaderStageCreateInfo(stageInfo, &elfHasher); + + elfHasher.Finalize(reinterpret_cast(&elfHash)); + + Pipeline::ElfHashToCacheId( + this, DefaultDeviceIndex, - &pipelineBuildInfo, - Vkgc::IPipelineDumper::GetPipelineHash(&pipelineBuildInfo.pipelineInfo), + elfHash, VkPhysicalDevice(DefaultDeviceIndex)->GetSettingsLoader()->GetSettingsHash(), - &cacheId); + pipelineOptimizerKey, + &cacheId + ); cacheResult = pCompiler->GetCachedPipelineBinary( &cacheId, @@ -1918,13 +1938,17 @@ VkResult Device::CreateInternalComputePipeline( if (cacheResult != Util::Result::Success) { - result = pCompiler->CreateComputePipelineBinary( - this, - DefaultDeviceIndex, - nullptr, - &pipelineBuildInfo, - &pipelineBinary, - &cacheId); + + if (result == VK_SUCCESS) + { + result = pCompiler->CreateComputePipelineBinary( + this, + DefaultDeviceIndex, + nullptr, + &pipelineBuildInfo, + &pipelineBinary, + &cacheId); + } if (result == VK_SUCCESS) { @@ -2532,7 +2556,7 @@ VkResult Device::CreateGraphicsPipelines( for (uint32_t i = 0; i < count; ++i) { const VkGraphicsPipelineCreateInfo* pCreateInfo = &pCreateInfos[i]; - PipelineCreateFlags flags = GetPipelineCreateFlags(pCreateInfo); + VkPipelineCreateFlags2KHR flags = GetPipelineCreateFlags(pCreateInfo); VkResult result = GraphicsPipelineCommon::Create( this, @@ -2583,7 +2607,7 @@ VkResult Device::CreateComputePipelines( for (uint32_t i = 0; i < count; ++i) { const VkComputePipelineCreateInfo* pCreateInfo = &pCreateInfos[i]; - PipelineCreateFlags flags = GetPipelineCreateFlags(pCreateInfo); + VkPipelineCreateFlags2KHR flags = GetPipelineCreateFlags(pCreateInfo); VkResult result = VK_SUCCESS; result = ComputePipeline::Create( @@ -3108,36 +3132,6 @@ void Device::RemoveMemReference( pPalDevice->RemoveGpuMemoryReferences(1, &pPalMemory, nullptr); } -// ===================================================================================================================== -VkResult Device::CreateBltMsaaStates() -{ - Pal::Result palResult = Pal::Result::Success; - - for (uint32_t log2Samples = 0; - (log2Samples < BltMsaaStateCount) && (palResult == Pal::Result::Success); - ++log2Samples) - { - uint32_t samples = (1UL << log2Samples); - - Pal::MsaaStateCreateInfo info = {}; - - info.coverageSamples = samples; - info.exposedSamples = samples; - info.pixelShaderSamples = samples; - info.depthStencilSamples = samples; - info.shaderExportMaskSamples = samples; - info.sampleMask = (1UL << samples) - 1; - info.sampleClusters = 0; - info.alphaToCoverageSamples = 0; - info.occlusionQuerySamples = samples; - - palResult = m_renderStateCache.CreateMsaaState( - info, nullptr, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, &m_pBltMsaaState[log2Samples][0]); - } - - return PalToVkResult(palResult); -} - // ===================================================================================================================== // Individual VkMemory objects fit some GPU VA base address alignment guarantees. Given a mask of memory type indices, // this function will return the *smallest* possible alignment amongst those types. Note that you can pass in a single @@ -4083,23 +4077,205 @@ VkResult Device::GetDeviceFaultInfoEXT( // ================================================================================================================= template -PipelineCreateFlags Device::GetPipelineCreateFlags( +VkPipelineCreateFlags2KHR Device::GetPipelineCreateFlags( const CreateInfo* pCreateInfo) { - PipelineCreateFlags flags = pCreateInfo->flags; + VkPipelineCreateFlags2KHR flags = pCreateInfo->flags; + + static_assert(VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT == + VK_PIPELINE_CREATE_2_DISABLE_OPTIMIZATION_BIT_KHR, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT == + VK_PIPELINE_CREATE_2_ALLOW_DERIVATIVES_BIT_KHR, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_DERIVATIVE_BIT == + VK_PIPELINE_CREATE_2_DERIVATIVE_BIT_KHR, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_VIEW_INDEX_FROM_DEVICE_INDEX_BIT == + VK_PIPELINE_CREATE_2_VIEW_INDEX_FROM_DEVICE_INDEX_BIT_KHR, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_DISPATCH_BASE_BIT == + VK_PIPELINE_CREATE_2_DISPATCH_BASE_BIT_KHR, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_DEFER_COMPILE_BIT_NV == + VK_PIPELINE_CREATE_2_DEFER_COMPILE_BIT_NV, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR == + VK_PIPELINE_CREATE_2_CAPTURE_STATISTICS_BIT_KHR, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR == + VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT == + VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT == + VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_LINK_TIME_OPTIMIZATION_BIT_EXT == + VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT == + VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_LIBRARY_BIT_KHR == + VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_RAY_TRACING_SKIP_TRIANGLES_BIT_KHR == + VK_PIPELINE_CREATE_2_RAY_TRACING_SKIP_TRIANGLES_BIT_KHR, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_RAY_TRACING_SKIP_AABBS_BIT_KHR == + VK_PIPELINE_CREATE_2_RAY_TRACING_SKIP_AABBS_BIT_KHR, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_RAY_TRACING_NO_NULL_ANY_HIT_SHADERS_BIT_KHR == + VK_PIPELINE_CREATE_2_RAY_TRACING_NO_NULL_ANY_HIT_SHADERS_BIT_KHR, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_RAY_TRACING_NO_NULL_CLOSEST_HIT_SHADERS_BIT_KHR == + VK_PIPELINE_CREATE_2_RAY_TRACING_NO_NULL_CLOSEST_HIT_SHADERS_BIT_KHR, + "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_RAY_TRACING_NO_NULL_MISS_SHADERS_BIT_KHR == + VK_PIPELINE_CREATE_2_RAY_TRACING_NO_NULL_MISS_SHADERS_BIT_KHR, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_RAY_TRACING_NO_NULL_INTERSECTION_SHADERS_BIT_KHR == + VK_PIPELINE_CREATE_2_RAY_TRACING_NO_NULL_INTERSECTION_SHADERS_BIT_KHR, + "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_RAY_TRACING_SHADER_GROUP_HANDLE_CAPTURE_REPLAY_BIT_KHR == + VK_PIPELINE_CREATE_2_RAY_TRACING_SHADER_GROUP_HANDLE_CAPTURE_REPLAY_BIT_KHR, + "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_INDIRECT_BINDABLE_BIT_NV == + VK_PIPELINE_CREATE_2_INDIRECT_BINDABLE_BIT_NV, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_RAY_TRACING_ALLOW_MOTION_BIT_NV == + VK_PIPELINE_CREATE_2_RAY_TRACING_ALLOW_MOTION_BIT_NV, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR == + VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR, + "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT == + VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT, + "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_RAY_TRACING_OPACITY_MICROMAP_BIT_EXT == + VK_PIPELINE_CREATE_2_RAY_TRACING_OPACITY_MICROMAP_BIT_EXT, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT == + VK_PIPELINE_CREATE_2_COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT == + VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT, + "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_NO_PROTECTED_ACCESS_BIT_EXT == + VK_PIPELINE_CREATE_2_NO_PROTECTED_ACCESS_BIT_EXT, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_PROTECTED_ACCESS_ONLY_BIT_EXT == + VK_PIPELINE_CREATE_2_PROTECTED_ACCESS_ONLY_BIT_EXT, "VkPipelineCreateFlags2KHR Flag Mismatch"); + static_assert(VK_PIPELINE_CREATE_DESCRIPTOR_BUFFER_BIT_EXT == + VK_PIPELINE_CREATE_2_DESCRIPTOR_BUFFER_BIT_EXT, "VkPipelineCreateFlags2KHR Flag Mismatch"); + + const void* pNext = pCreateInfo->pNext; + + while (pNext != nullptr) + { + const auto* pHeader = static_cast(pNext); + + switch (static_cast(pHeader->sType)) + { + case VK_STRUCTURE_TYPE_PIPELINE_CREATE_FLAGS_2_CREATE_INFO_KHR: + { + const auto* pCreateFlags = reinterpret_cast(pHeader); + flags = pCreateFlags->flags; + } + break; + + default: + // Skip any unknown extension structures + break; + } + + pNext = pHeader->pNext; + } return flags; } // ================================================================================================================= -BufferUsageFlagBits Device::GetBufferUsageFlagBits( +VkBufferUsageFlagBits2KHR Device::GetBufferUsageFlagBits( const VkBufferCreateInfo* pCreateInfo) { - BufferUsageFlagBits usage = static_cast(pCreateInfo->usage); + VkBufferUsageFlagBits2KHR usage = static_cast(pCreateInfo->usage); + + static_assert(VK_BUFFER_USAGE_TRANSFER_SRC_BIT == VK_BUFFER_USAGE_2_TRANSFER_SRC_BIT_KHR, + "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_TRANSFER_DST_BIT == VK_BUFFER_USAGE_2_TRANSFER_DST_BIT_KHR, + "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT == VK_BUFFER_USAGE_2_UNIFORM_TEXEL_BUFFER_BIT_KHR, + "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT == VK_BUFFER_USAGE_2_STORAGE_TEXEL_BUFFER_BIT_KHR, + "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT == VK_BUFFER_USAGE_2_UNIFORM_BUFFER_BIT_KHR, + "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_STORAGE_BUFFER_BIT == VK_BUFFER_USAGE_2_STORAGE_BUFFER_BIT_KHR, + "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_INDEX_BUFFER_BIT == VK_BUFFER_USAGE_2_INDEX_BUFFER_BIT_KHR, + "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_VERTEX_BUFFER_BIT == VK_BUFFER_USAGE_2_VERTEX_BUFFER_BIT_KHR, + "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT == VK_BUFFER_USAGE_2_INDIRECT_BUFFER_BIT_KHR, + "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT == VK_BUFFER_USAGE_2_CONDITIONAL_RENDERING_BIT_EXT, + "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_SHADER_BINDING_TABLE_BIT_KHR == VK_BUFFER_USAGE_2_SHADER_BINDING_TABLE_BIT_KHR, + "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT == + VK_BUFFER_USAGE_2_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT, "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_COUNTER_BUFFER_BIT_EXT == + VK_BUFFER_USAGE_2_TRANSFORM_FEEDBACK_COUNTER_BUFFER_BIT_EXT, "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_VIDEO_DECODE_SRC_BIT_KHR == VK_BUFFER_USAGE_2_VIDEO_DECODE_SRC_BIT_KHR, + "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_VIDEO_DECODE_DST_BIT_KHR == VK_BUFFER_USAGE_2_VIDEO_DECODE_DST_BIT_KHR, + "VkBufferUsageFlags2KHR Flag Mismatch"); +#ifdef VK_ENABLE_BETA_EXTENSIONS + static_assert(VK_BUFFER_USAGE_VIDEO_ENCODE_DST_BIT_KHR == VK_BUFFER_USAGE_2_VIDEO_ENCODE_DST_BIT_KHR, + "VkBufferUsageFlags2KHR Flag Mismatch"); +#endif +#ifdef VK_ENABLE_BETA_EXTENSIONS + static_assert(VK_BUFFER_USAGE_VIDEO_ENCODE_SRC_BIT_KHR == VK_BUFFER_USAGE_2_VIDEO_ENCODE_SRC_BIT_KHR, + "VkBufferUsageFlags2KHR Flag Mismatch"); +#endif + static_assert(VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR == VK_BUFFER_USAGE_2_SHADER_DEVICE_ADDRESS_BIT_KHR, + "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR == + VK_BUFFER_USAGE_2_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR, + "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR == + VK_BUFFER_USAGE_2_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR, "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT == + VK_BUFFER_USAGE_2_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT, "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT == + VK_BUFFER_USAGE_2_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT, "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_PUSH_DESCRIPTORS_DESCRIPTOR_BUFFER_BIT_EXT == + VK_BUFFER_USAGE_2_PUSH_DESCRIPTORS_DESCRIPTOR_BUFFER_BIT_EXT, "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_MICROMAP_BUILD_INPUT_READ_ONLY_BIT_EXT == + VK_BUFFER_USAGE_2_MICROMAP_BUILD_INPUT_READ_ONLY_BIT_EXT, "VkBufferUsageFlags2KHR Flag Mismatch"); + static_assert(VK_BUFFER_USAGE_MICROMAP_STORAGE_BIT_EXT == VK_BUFFER_USAGE_2_MICROMAP_STORAGE_BIT_EXT, + "VkBufferUsageFlags2KHR Flag Mismatch"); + + const void* pNext = pCreateInfo->pNext; + + while (pNext != nullptr) + { + const auto* pHeader = static_cast(pNext); + + switch (static_cast(pHeader->sType)) + { + case VK_STRUCTURE_TYPE_BUFFER_USAGE_FLAGS_2_CREATE_INFO_KHR: + { + const auto* pCreateFlags = reinterpret_cast(pHeader); + usage = pCreateFlags->usage; + } + break; + + default: + // Skip any unknown extension structures + break; + } + + pNext = pHeader->pNext; + } return usage; } +// ===================================================================================================================== +void Device::SetDefaultVrsRateParams( + Pal::VrsRateParams* pVrsRateParams) +{ + pVrsRateParams->shadingRate = Pal::VrsShadingRate::_1x1; + pVrsRateParams->flags.exposeVrsPixelsMask = 1; + + for (uint32 idx = 0; idx < static_cast(Pal::VrsCombinerStage::Max); idx++) + { + pVrsRateParams->combinerState[idx] = Pal::VrsCombiner::Passthrough; + } +} + /** *********************************************************************************************************************** * C-Callable entry points start here. These entries go in the dispatch table(s). @@ -4455,6 +4631,16 @@ VKAPI_ATTR void VKAPI_CALL vkGetRenderAreaGranularity( pGranularity->height = 1; } +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkGetRenderingAreaGranularityKHR( + VkDevice device, + const VkRenderingAreaInfoKHR* pRenderingAreaInfo, + VkExtent2D* pGranularity) +{ + pGranularity->width = 1; + pGranularity->height = 1; +} + // ===================================================================================================================== VKAPI_ATTR VkResult VKAPI_CALL vkAllocateCommandBuffers( VkDevice device, @@ -5044,12 +5230,37 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetDeviceFaultInfoEXT( return pDevice->GetDeviceFaultInfoEXT(pFaultCounts, pFaultInfo); } +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkGetDeviceImageSubresourceLayoutKHR( + VkDevice device, + const VkDeviceImageSubresourceInfoKHR* pInfo, + VkSubresourceLayout2KHR* pLayout) +{ + Device* pDevice = ApiDevice::ObjectFromHandle(device); + Image::CalculateSubresourceLayout(pDevice, pInfo->pCreateInfo, &pInfo->pSubresource->imageSubresource, + &pLayout->subresourceLayout); +} +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkGetImageSubresourceLayout2KHR( + VkDevice device, + VkImage image, + const VkImageSubresource2KHR* pSubresource, + VkSubresourceLayout2KHR* pLayout) +{ + const Device* pDevice = ApiDevice::ObjectFromHandle(device); + + Image::ObjectFromHandle(image)->GetSubresourceLayout( + pDevice, + &pSubresource->imageSubresource, + &pLayout->subresourceLayout); +} + } // entry } // vk #if VKI_RAY_TRACING template -vk::PipelineCreateFlags vk::Device::GetPipelineCreateFlags( +VkPipelineCreateFlags2KHR vk::Device::GetPipelineCreateFlags( const VkRayTracingPipelineCreateInfoKHR* pCreateInfo); #endif diff --git a/icd/api/vk_dispatch.cpp b/icd/api/vk_dispatch.cpp index ca34ec54..cfbe9400 100644 --- a/icd/api/vk_dispatch.cpp +++ b/icd/api/vk_dispatch.cpp @@ -386,6 +386,7 @@ void DispatchTable::Init() INIT_DISPATCH_ENTRY(vkGetPipelineCacheData ); INIT_DISPATCH_ENTRY(vkGetQueryPoolResults ); INIT_DISPATCH_ENTRY(vkGetRenderAreaGranularity ); + INIT_DISPATCH_ENTRY(vkGetRenderingAreaGranularityKHR ); INIT_DISPATCH_ENTRY(vkGetPhysicalDeviceSurfaceCapabilitiesKHR ); INIT_DISPATCH_ENTRY(vkGetPhysicalDeviceSurfaceCapabilities2KHR ); INIT_DISPATCH_ENTRY(vkGetPhysicalDeviceSurfaceFormatsKHR ); @@ -729,6 +730,8 @@ void DispatchTable::Init() INIT_DISPATCH_ENTRY(vkCmdSetColorWriteEnableEXT ); + INIT_DISPATCH_ENTRY(vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR); + INIT_DISPATCH_ENTRY(vkCmdSetRasterizerDiscardEnable ); INIT_DISPATCH_ENTRY(vkCmdSetPrimitiveRestartEnable ); INIT_DISPATCH_ENTRY(vkCmdSetDepthBiasEnable ); @@ -811,6 +814,10 @@ void DispatchTable::Init() INIT_DISPATCH_ENTRY(vkGetImageDrmFormatModifierPropertiesEXT ); #endif + INIT_DISPATCH_ENTRY(vkGetDeviceImageSubresourceLayoutKHR ); + INIT_DISPATCH_ENTRY(vkGetImageSubresourceLayout2KHR ); + INIT_DISPATCH_ENTRY(vkCmdBindIndexBuffer2KHR ); + } // ===================================================================================================================== diff --git a/icd/api/vk_graphics_pipeline.cpp b/icd/api/vk_graphics_pipeline.cpp index 6d943f82..16ee24c3 100644 --- a/icd/api/vk_graphics_pipeline.cpp +++ b/icd/api/vk_graphics_pipeline.cpp @@ -45,6 +45,7 @@ #include "palVectorImpl.h" #include +#include #include using namespace Util; @@ -57,7 +58,7 @@ namespace vk VkResult GraphicsPipeline::CreatePipelineBinaries( Device* pDevice, const VkGraphicsPipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const GraphicsPipelineShaderStageInfo* pShaderInfo, const PipelineLayout* pPipelineLayout, const Util::MetroHash::Hash* pElfHash, @@ -319,7 +320,7 @@ VkResult GraphicsPipeline::CreatePalPipelineObjects( VkResult GraphicsPipeline::CreatePipelineObjects( Device* pDevice, const VkGraphicsPipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const VkAllocationCallbacks* pAllocator, const PipelineLayout* pPipelineLayout, const VbBindingInfo* pVbInfo, @@ -398,15 +399,27 @@ VkResult GraphicsPipeline::CreatePipelineObjects( { // Override the shader rate to 1x1 if SampleId used in shader or // supportVrsWithDsExports is not supported and SampleMask used in shader. - Force1x1ShaderRate(&pObjectCreateInfo->immedInfo.vrsRateParams); + Device::SetDefaultVrsRateParams(&pObjectCreateInfo->immedInfo.vrsRateParams); + pObjectCreateInfo->flags.force1x1ShaderRate = true; if (pObjectCreateInfo->flags.bindMsaaObject == false) { pObjectCreateInfo->flags.sampleShadingEnable = true; pObjectCreateInfo->immedInfo.minSampleShading = 1.0f; } + pObjectCreateInfo->immedInfo.msaaCreateInfo.pixelShaderSamples = pObjectCreateInfo->immedInfo.msaaCreateInfo.coverageSamples; + + // Both MSAA and VRS would utilize the value of PS_ITER_SAMPLES + // Thus, choose the min combiner (i.e. choose the higher quality rate) when both features are + // enabled + if ((pObjectCreateInfo->immedInfo.msaaCreateInfo.pixelShaderSamples > 1) && + (pObjectCreateInfo->immedInfo.vrsRateParams.flags.exposeVrsPixelsMask == 1)) + { + pObjectCreateInfo->immedInfo.vrsRateParams.combinerState[ + static_cast(Pal::VrsCombinerStage::PsIterSamples)] = Pal::VrsCombiner::Min; + } } } @@ -546,13 +559,92 @@ static bool IsGplFastLinkPossible( return result; } +// ===================================================================================================================== +void DumpGplFastLinkInfo( + const Device* pDevice, + VkPipeline pipeline, + GraphicsPipelineBinaryCreateInfo* pCreateInfo) +{ + const GraphicsPipeline* pGraphicsPipeline = GraphicsPipeline::ObjectFromHandle(pipeline); + const Pal::IPipeline* pPalPipeline = pGraphicsPipeline->GetPalPipeline(DefaultDeviceIndex); + const Pal::PipelineInfo info = pPalPipeline->GetInfo(); + const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + + uint64_t dumpHash = settings.dumpPipelineWithApiHash ? pCreateInfo->apiPsoHash : info.internalPipelineHash.stable; + + Vkgc::PipelineDumpOptions dumpOptions = {}; + dumpOptions.pDumpDir = settings.pipelineDumpDir; + dumpOptions.filterPipelineDumpByType = settings.filterPipelineDumpByType; + dumpOptions.filterPipelineDumpByHash = settings.filterPipelineDumpByHash; + dumpOptions.dumpDuplicatePipelines = settings.dumpDuplicatePipelines; + + Vkgc::PipelineBuildInfo pipelineInfo = {}; + pCreateInfo->pipelineInfo.unlinked = false; + pipelineInfo.pGraphicsInfo = &pCreateInfo->pipelineInfo; + + void* pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump(&dumpOptions, pipelineInfo, dumpHash); + if (pPipelineDumpHandle != nullptr) + { + char extraInfo[256] = {}; + + Util::Snprintf( + extraInfo, + sizeof(extraInfo), + "; ApiPsoHash: 0x%016" PRIX64 "\n", + pCreateInfo->apiPsoHash); + Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, extraInfo); + for (uint32_t i = 0; i < GraphicsLibraryCount; i++) + { + if (pCreateInfo->pShaderLibraries[i] == nullptr) + { + continue; + } + const Pal::LibraryInfo& libInfo = pCreateInfo->pShaderLibraries[i]->GetInfo(); + Util::Snprintf( + extraInfo, + sizeof(extraInfo), + "; GraphicsPipelineLibrary Hash: 0x%016" PRIX64 "\n", + libInfo.internalLibraryHash.stable); + Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, extraInfo); + } + + for (uint32_t i = 0; i < GraphicsLibraryCount; i++) + { + if (pCreateInfo->pShaderLibraries[i] == nullptr) + { + continue; + } + uint32_t codeSize = 0; + Pal::Result result = pCreateInfo->pShaderLibraries[i]->GetCodeObject(&codeSize, nullptr); + if ((codeSize > 0) && (result == Pal::Result::Success)) + { + void* pCode = pDevice->VkInstance()->AllocMem(codeSize, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pCode != nullptr) + { + result = pCreateInfo->pShaderLibraries[i]->GetCodeObject(&codeSize, pCode); + VK_ASSERT(result == Pal::Result::Success); + + Vkgc::BinaryData libraryBinary = {}; + libraryBinary.codeSize = codeSize; + libraryBinary.pCode = pCode; + Vkgc::IPipelineDumper::DumpPipelineBinary( + pPipelineDumpHandle, pDevice->GetCompiler(DefaultDeviceIndex)->GetGfxIp(), &libraryBinary); + + pDevice->VkInstance()->FreeMem(pCode); + } + } + } + Vkgc::IPipelineDumper::EndPipelineDump(pPipelineDumpHandle); + } +} + // ===================================================================================================================== // Create a graphics pipeline object. VkResult GraphicsPipeline::Create( Device* pDevice, PipelineCache* pPipelineCache, const VkGraphicsPipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const VkAllocationCallbacks* pAllocator, VkPipeline* pPipeline) { @@ -614,6 +706,7 @@ VkResult GraphicsPipeline::Create( if (result == VK_SUCCESS) { shaderLibraries[numShaderLibraries++] = pColorExportLib; + binaryCreateInfo.pShaderLibraries[GraphicsLibraryColorExport] = pColorExportLib; } } } @@ -626,6 +719,12 @@ VkResult GraphicsPipeline::Create( { objectCreateInfo.pipeline.ppShaderLibraries = shaderLibraries; objectCreateInfo.pipeline.numShaderLibraries = numShaderLibraries; + if ((pDevice->VkInstance()->GetDevModeMgr() != nullptr) || + pDevice->GetRuntimeSettings().enablePipelineDump) + { + BuildApiHash(pCreateInfo, flags, &apiPsoHash, &elfHash); + binaryCreateInfo.apiPsoHash = apiPsoHash; + } enableFastLink = true; } } @@ -637,7 +736,7 @@ VkResult GraphicsPipeline::Create( result = BuildShaderStageInfo(pDevice, pCreateInfo->stageCount, pCreateInfo->pStages, - flags & VK_PIPELINE_CREATE_LIBRARY_BIT_KHR, + Util::TestAnyFlagSet(flags, VK_PIPELINE_CREATE_LIBRARY_BIT_KHR), [](const uint32_t inputIdx, const uint32_t stageIdx) { return stageIdx; @@ -810,6 +909,11 @@ VkResult GraphicsPipeline::Create( PipelineCompileTime, "0x%016llX-%llu", apiPsoHash, duration); + + if (enableFastLink && pDevice->GetRuntimeSettings().enablePipelineDump) + { + DumpGplFastLinkInfo(pDevice, *pPipeline, &binaryCreateInfo); + } } return result; diff --git a/icd/api/vk_graphics_pipeline_library.cpp b/icd/api/vk_graphics_pipeline_library.cpp index 7debb3bb..4ba52659 100644 --- a/icd/api/vk_graphics_pipeline_library.cpp +++ b/icd/api/vk_graphics_pipeline_library.cpp @@ -440,7 +440,7 @@ VkResult GraphicsPipelineLibrary::Create( Device* pDevice, PipelineCache* pPipelineCache, const VkGraphicsPipelineCreateInfo* pCreateInfo, - PipelineCreateFlags flags, + VkPipelineCreateFlags2KHR flags, const VkAllocationCallbacks* pAllocator, VkPipeline* pPipeline) { @@ -466,7 +466,7 @@ VkResult GraphicsPipelineLibrary::Create( result = BuildShaderStageInfo(pDevice, pCreateInfo->stageCount, pCreateInfo->pStages, - pCreateInfo->flags & VK_PIPELINE_CREATE_LIBRARY_BIT_KHR, + Util::TestAnyFlagSet(flags, VK_PIPELINE_CREATE_LIBRARY_BIT_KHR), [](const uint32_t inputIdx, const uint32_t stageIdx) { return stageIdx; diff --git a/icd/api/vk_image.cpp b/icd/api/vk_image.cpp index 3ddb13ec..4e252ebb 100644 --- a/icd/api/vk_image.cpp +++ b/icd/api/vk_image.cpp @@ -296,10 +296,10 @@ void Image::ConvertImageCreateInfo( pPalCreateInfo->flags.optimalShareable = 1; } - if (((pCreateInfo->flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT) == 0) && + if (((pCreateInfo->flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT) == 0) && ((pCreateInfo->flags & VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT) != 0) && - (pCreateInfo->mipLevels > 1) && - Pal::Formats::IsBlockCompressed(pPalCreateInfo->swizzledFormat.format) && + (pCreateInfo->mipLevels > 1) && + Pal::Formats::IsBlockCompressed(pPalCreateInfo->swizzledFormat.format) && (pCreateInfo->imageType == VK_IMAGE_TYPE_3D)) { pPalCreateInfo->flags.view3dAs2dArray = 1; diff --git a/icd/api/vk_image_view.cpp b/icd/api/vk_image_view.cpp index 3a5f8359..cbb0bbec 100644 --- a/icd/api/vk_image_view.cpp +++ b/icd/api/vk_image_view.cpp @@ -559,15 +559,9 @@ VkResult ImageView::Create( { // Image views having both DEPTH_BIT and STENCIL_BIT specified in the aspectMask cannot be used as a sampled // image view, only as attachment, so check the condition before trying to generate any SRDs for the view. - // - // Also note that, for 2D array compatible 3D images, SRDs should only be created for 3D image views. Trying - // to use atomic/load/store ops against 2D and 2D array image views created from such images is illegal from the API - // PoV, and triggers an assertion failure in PAL. const VkImageAspectFlags combinedDsView = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; - if ((pCreateInfo->subresourceRange.aspectMask & combinedDsView) != combinedDsView && - ( !pImage->Is2dArrayCompatible() || - (pImage->Is2dArrayCompatible() && pCreateInfo->viewType == VK_IMAGE_VIEW_TYPE_3D)) ) + if ((pCreateInfo->subresourceRange.aspectMask & combinedDsView) != combinedDsView) { srdSegmentOffset = totalSize; srdSegmentSize = srdSize * SrdCount; diff --git a/icd/api/vk_physical_device.cpp b/icd/api/vk_physical_device.cpp index de25388a..201f3c20 100644 --- a/icd/api/vk_physical_device.cpp +++ b/icd/api/vk_physical_device.cpp @@ -128,6 +128,38 @@ constexpr VkFormatFeatureFlags AllBufFeatures = #endif VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT; +struct CooperativeMatrixType +{ + VkComponentTypeKHR a; + VkComponentTypeKHR b; + VkComponentTypeKHR c; +}; + +constexpr CooperativeMatrixType CooperativeMatrixTypes[] = +{ + { VK_COMPONENT_TYPE_FLOAT16_KHR, VK_COMPONENT_TYPE_FLOAT16_KHR, VK_COMPONENT_TYPE_FLOAT32_KHR }, + { VK_COMPONENT_TYPE_FLOAT16_KHR, VK_COMPONENT_TYPE_FLOAT16_KHR, VK_COMPONENT_TYPE_FLOAT16_KHR }, + { VK_COMPONENT_TYPE_UINT8_KHR, VK_COMPONENT_TYPE_UINT8_KHR, VK_COMPONENT_TYPE_SINT32_KHR }, + { VK_COMPONENT_TYPE_SINT8_KHR, VK_COMPONENT_TYPE_SINT8_KHR, VK_COMPONENT_TYPE_SINT32_KHR }, + { VK_COMPONENT_TYPE_UINT8_KHR, VK_COMPONENT_TYPE_SINT8_KHR, VK_COMPONENT_TYPE_SINT32_KHR }, + { VK_COMPONENT_TYPE_SINT8_KHR, VK_COMPONENT_TYPE_UINT8_KHR, VK_COMPONENT_TYPE_SINT32_KHR }, +}; + +constexpr uint32_t CooperativeMatrixTypesCount = VK_ARRAY_SIZE(CooperativeMatrixTypes); + +constexpr CooperativeMatrixType CooperativeMatrixSaturatingTypes[] = +{ + { VK_COMPONENT_TYPE_UINT8_KHR, VK_COMPONENT_TYPE_UINT8_KHR, VK_COMPONENT_TYPE_SINT32_KHR }, + { VK_COMPONENT_TYPE_SINT8_KHR, VK_COMPONENT_TYPE_SINT8_KHR, VK_COMPONENT_TYPE_SINT32_KHR }, + { VK_COMPONENT_TYPE_UINT8_KHR, VK_COMPONENT_TYPE_SINT8_KHR, VK_COMPONENT_TYPE_SINT32_KHR }, + { VK_COMPONENT_TYPE_SINT8_KHR, VK_COMPONENT_TYPE_UINT8_KHR, VK_COMPONENT_TYPE_SINT32_KHR }, +}; + +constexpr uint32_t CooperativeMatrixSaturatingTypesCount = VK_ARRAY_SIZE(CooperativeMatrixSaturatingTypes); + +// Dimension size for M, N and K +constexpr uint32_t CooperativeMatrixDimension = 16; + #if PAL_ENABLE_PRINTS_ASSERTS static void VerifyProperties(const PhysicalDevice& device); #endif @@ -4039,6 +4071,13 @@ bool PhysicalDevice::HwSupportsRayTracing() const } #endif +static bool IsKhrCooperativeMatrixSupported( + const PhysicalDevice* pPhysicalDevice) +{ + return ((pPhysicalDevice == nullptr) || + (pPhysicalDevice->PalProperties().gfxipProperties.flags.supportCooperativeMatrix)); +} + // ===================================================================================================================== // Get available device extensions or populate the specified physical device with the extensions supported by it. // @@ -4241,6 +4280,7 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_TERMINATE_INVOCATION)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_EXTENDED_DYNAMIC_STATE2)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_FORMAT_FEATURE_FLAGS2)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_FRAME_BOUNDARY)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_DEPTH_CLIP_CONTROL)); @@ -4304,6 +4344,13 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_BORDER_COLOR_SWIZZLE)); } + if (IsKhrCooperativeMatrixSupported(pPhysicalDevice)) + { + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_COOPERATIVE_MATRIX)); + } + + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_MAINTENANCE5)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_PUSH_DESCRIPTOR)); if ((pPhysicalDevice == nullptr) || @@ -6043,6 +6090,21 @@ size_t PhysicalDevice::GetFeatures2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADER_INTERLOCK_FEATURES_EXT: + { + auto *pExtInfo = reinterpret_cast(pHeader); + + if (updateFeatures) + { + pExtInfo->fragmentShaderSampleInterlock = VK_FALSE; + pExtInfo->fragmentShaderPixelInterlock = VK_FALSE; + pExtInfo->fragmentShaderShadingRateInterlock = VK_FALSE; + } + + structSize = sizeof(*pExtInfo); + + break; + } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_FEATURES_EXT: { auto* pExtInfo = reinterpret_cast(pHeader); @@ -6642,6 +6704,20 @@ size_t PhysicalDevice::GetFeatures2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR: + { + auto* pExtInfo = reinterpret_cast(pHeader); + + if (updateFeatures) + { + pExtInfo->cooperativeMatrix = VK_TRUE; + pExtInfo->cooperativeMatrixRobustBufferAccess = VK_TRUE; + } + + structSize = sizeof(*pExtInfo); + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GRAPHICS_PIPELINE_LIBRARY_FEATURES_EXT: { auto* pExtInfo = reinterpret_cast(pHeader); @@ -6662,8 +6738,8 @@ size_t PhysicalDevice::GetFeatures2( if (updateFeatures) { pExtInfo->primitivesGeneratedQuery = VK_TRUE; - pExtInfo->primitivesGeneratedQueryWithRasterizerDiscard = VK_FALSE; - pExtInfo->primitivesGeneratedQueryWithNonZeroStreams = VK_FALSE; + pExtInfo->primitivesGeneratedQueryWithRasterizerDiscard = VK_TRUE; + pExtInfo->primitivesGeneratedQueryWithNonZeroStreams = VK_TRUE; } structSize = sizeof(*pExtInfo); @@ -6796,6 +6872,19 @@ size_t PhysicalDevice::GetFeatures2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_5_FEATURES_KHR: + { + auto* pExtInfo = reinterpret_cast(pHeader); + + if (updateFeatures) + { + pExtInfo->maintenance5 = VK_TRUE; + } + + structSize = sizeof(*pExtInfo); + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PAGEABLE_DEVICE_LOCAL_MEMORY_FEATURES_EXT: { auto* pExtInfo = reinterpret_cast(pHeader); @@ -7901,6 +7990,13 @@ void PhysicalDevice::GetDeviceProperties2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_KHR: + { + auto* pProps = static_cast(pNext); + pProps->cooperativeMatrixSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GRAPHICS_PIPELINE_LIBRARY_PROPERTIES_EXT: { if (IsExtensionSupported(DeviceExtensions::EXT_GRAPHICS_PIPELINE_LIBRARY)) @@ -7920,6 +8016,19 @@ void PhysicalDevice::GetDeviceProperties2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_5_PROPERTIES_KHR: + { + auto* pProps = static_cast(pNext); + + pProps->earlyFragmentMultisampleCoverageAfterSampleCounting = VK_TRUE; + pProps->earlyFragmentSampleMaskTestBeforeSampleCounting = VK_TRUE; + pProps->depthStencilSwizzleOneSupport = VK_TRUE; + pProps->polygonModePointSize = VK_TRUE; + pProps->nonStrictSinglePixelWideLinesUseParallelogram = VK_TRUE; + pProps->nonStrictWideLinesUseParallelogram = VK_TRUE; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_PROPERTIES_EXT: { auto* pProps = static_cast(pNext); @@ -9293,6 +9402,57 @@ VkResult PhysicalDevice::GetFragmentShadingRates( return (*pFragmentShadingRateCount < numberOfSupportedShaderRates) ? VK_INCOMPLETE : VK_SUCCESS; } +// ===================================================================================================================== +// Retrieve KHR cooperative matrix properties. Called in response to vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR +VkResult PhysicalDevice::GetPhysicalDeviceCooperativeMatrixPropertiesKHR( + uint32_t* pPropertyCount, + VkCooperativeMatrixPropertiesKHR* pProperties) +{ + VkResult result = VK_SUCCESS; + + if (IsKhrCooperativeMatrixSupported(this)) + { + constexpr uint32_t totalCount = CooperativeMatrixTypesCount + CooperativeMatrixSaturatingTypesCount; + + if (pProperties == nullptr) + { + *pPropertyCount = totalCount; + } + else + { + if (*pPropertyCount < totalCount) + { + result = VK_INCOMPLETE; + } + + *pPropertyCount = Util::Min(*pPropertyCount, totalCount); + + for (uint32_t i = 0; i < *pPropertyCount; ++i) + { + const bool sat = (i >= CooperativeMatrixTypesCount); + const uint32_t n = sat ? i - CooperativeMatrixTypesCount : i; + const CooperativeMatrixType* types = sat ? CooperativeMatrixSaturatingTypes : CooperativeMatrixTypes; + + pProperties[i].MSize = CooperativeMatrixDimension; + pProperties[i].NSize = CooperativeMatrixDimension; + pProperties[i].KSize = CooperativeMatrixDimension; + pProperties[i].AType = types[n].a; + pProperties[i].BType = types[n].b; + pProperties[i].CType = types[n].c; + pProperties[i].ResultType = types[n].c; + pProperties[i].scope = VK_SCOPE_SUBGROUP_KHR; + pProperties[i].saturatingAccumulation = sat ? VK_TRUE : VK_FALSE; + } + } + } + else + { + *pPropertyCount = 0; + } + + return result; +} + // C-style entry points namespace entry { @@ -9930,6 +10090,17 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceFragmentShadingRatesKHR( pFragmentShadingRates); } +// ===================================================================================================================== +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR( + VkPhysicalDevice physicalDevice, + uint32_t* pPropertyCount, + VkCooperativeMatrixPropertiesKHR* pProperties) +{ + return ApiPhysicalDevice::ObjectFromHandle(physicalDevice)->GetPhysicalDeviceCooperativeMatrixPropertiesKHR( + pPropertyCount, + pProperties); +} + } } diff --git a/icd/api/vk_pipeline.cpp b/icd/api/vk_pipeline.cpp index 006fb0c3..6696ea15 100644 --- a/icd/api/vk_pipeline.cpp +++ b/icd/api/vk_pipeline.cpp @@ -102,12 +102,12 @@ static_assert(VK_ARRAY_SIZE(HwStageNames) == static_cast(Util::Abi::Ha static constexpr uint32_t ExecutableStatisticsCount = 5; // ===================================================================================================================== -// Filter VkPipelineCreateFlags to only values used for pipeline caching -PipelineCreateFlags Pipeline::GetCacheIdControlFlags( - PipelineCreateFlags in) +// Filter VkPipelineCreateFlags2KHR to only values used for pipeline caching +VkPipelineCreateFlags2KHR Pipeline::GetCacheIdControlFlags( + VkPipelineCreateFlags2KHR in) { // The following flags should NOT affect cache computation - static constexpr PipelineCreateFlags CacheIdIgnoreFlags = { 0 + static constexpr VkPipelineCreateFlags2KHR CacheIdIgnoreFlags = { 0 | VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR | VK_PIPELINE_CREATE_DERIVATIVE_BIT | VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT @@ -838,11 +838,12 @@ void Pipeline::ElfHashToCacheId( // Extensions and features whose enablement affects compiler inputs (and hence the binary) hasher.Update(pDevice->IsExtensionEnabled(DeviceExtensions::AMD_SHADER_INFO)); hasher.Update(pDevice->IsExtensionEnabled(DeviceExtensions::EXT_PRIMITIVES_GENERATED_QUERY)); - hasher.Update(pDevice->IsExtensionEnabled(DeviceExtensions::EXT_TRANSFORM_FEEDBACK)); { + hasher.Update(pDevice->IsExtensionEnabled(DeviceExtensions::EXT_TRANSFORM_FEEDBACK)); hasher.Update(pDevice->IsExtensionEnabled(DeviceExtensions::EXT_SCALAR_BLOCK_LAYOUT)); hasher.Update(pDevice->GetEnabledFeatures().scalarBlockLayout); } + hasher.Update(pDevice->GetEnabledFeatures().robustBufferAccess); hasher.Update(pDevice->GetEnabledFeatures().robustBufferAccessExtended); hasher.Update(pDevice->GetEnabledFeatures().robustImageAccessExtended); diff --git a/icd/api/vk_pipeline_layout.cpp b/icd/api/vk_pipeline_layout.cpp index 83c0b584..74427f80 100644 --- a/icd/api/vk_pipeline_layout.cpp +++ b/icd/api/vk_pipeline_layout.cpp @@ -248,6 +248,14 @@ bool PipelineLayout::HasRayTracing( } #endif +// ===================================================================================================================== +bool PipelineLayout::ReserveXfbNode( + const Device* pDevice) +{ + return pDevice->IsExtensionEnabled(DeviceExtensions::EXT_TRANSFORM_FEEDBACK) + ; +} + // ===================================================================================================================== VkResult PipelineLayout::BuildCompactSchemeInfo( const Device* pDevice, @@ -303,7 +311,7 @@ VkResult PipelineLayout::BuildCompactSchemeInfo( uint32_t gfxReservedCount = 0; // Reserve an user-data to store the VA of buffer for transform feedback. - if (pDevice->IsExtensionEnabled(DeviceExtensions::EXT_TRANSFORM_FEEDBACK)) + if (ReserveXfbNode(pDevice)) { gfxReservedCount++; } @@ -354,7 +362,7 @@ VkResult PipelineLayout::BuildCompactSchemeInfo( } // Reserve an user-data to store the VA of buffer for transform feedback. - if (pDevice->IsExtensionEnabled(DeviceExtensions::EXT_TRANSFORM_FEEDBACK)) + if (ReserveXfbNode(pDevice)) { pUserDataLayout->transformFeedbackRegBase = pInfo->userDataRegCount; pUserDataLayout->transformFeedbackRegCount = 1; @@ -610,7 +618,7 @@ VkResult PipelineLayout::BuildIndirectSchemeInfo( pInfo->userDataRegCount += 1; // Allocate user data for transform feedback buffer - if (pDevice->IsExtensionEnabled(DeviceExtensions::EXT_TRANSFORM_FEEDBACK)) + if (ReserveXfbNode(pDevice)) { pUserDataLayout->transformFeedbackRegBase = pInfo->userDataRegCount; pPipelineInfo->numUserDataNodes += 1; @@ -1466,7 +1474,8 @@ void PipelineLayout::BuildIndirectSchemeLlpcPipelineMapping( const auto& userDataLayout = m_info.userDataLayout.indirect; const bool uberFetchShaderEnabled = IsUberFetchShaderEnabled(m_pDevice); - const bool transformFeedbackEnabled = m_pDevice->IsExtensionEnabled(DeviceExtensions::EXT_TRANSFORM_FEEDBACK); + const bool transformFeedbackEnabled = ReserveXfbNode(m_pDevice); + const bool threadGroupReversalEnabled = userDataLayout.threadGroupReversalRegBase != InvalidReg; #if VKI_RAY_TRACING const bool rayTracingEnabled = m_pipelineInfo.hasRayTracing; diff --git a/icd/api/vk_query.cpp b/icd/api/vk_query.cpp index 4f29ea61..c0ef7b1c 100644 --- a/icd/api/vk_query.cpp +++ b/icd/api/vk_query.cpp @@ -103,8 +103,8 @@ VkResult PalQueryPool::Create( else if ((pCreateInfo->queryType == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT) || (pCreateInfo->queryType == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT)) { - queryType = Pal::QueryType::PipelineStats; - createInfo.queryPoolType = Pal::QueryPoolType::PipelineStats; + queryType = Pal::QueryType::StreamoutStats; + createInfo.queryPoolType = Pal::QueryPoolType::StreamoutStats; } if (VK_ENUM_IN_RANGE(pCreateInfo->queryType, VK_QUERY_TYPE)) @@ -117,11 +117,6 @@ VkResult PalQueryPool::Create( VkQueryPipelineStatisticFlags enabledStats = pCreateInfo->pipelineStatistics; - if (pCreateInfo->queryType == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT) - { - enabledStats |= VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT; - } - createInfo.enabledStats = VkToPalQueryPipelineStatsFlags(enabledStats); if (pCreateInfo->queryType == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT) @@ -277,13 +272,15 @@ VkResult PalQueryPool::GetResults( // HW will returns two 64-bits integers for the query of transform feedback, they are written primitives and the // number of needed primitives. And if the flag VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, an extra integer // which indicates the availability state needs to be written. - const uint32_t numXfbQueryDataElems = availability ? 3 : 2; + const uint32_t numCounters = (m_queryType == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT) ? 1 : 2; + const uint32_t numXfbQueryDataElems = availability ? (numCounters + 1) : numCounters; // Vulkan supports 32-bit unsigned integer values data of transform feedback query, but Pal supports 64-bit only. // So the query data is stored into pXfbQueryData first. uint64_t* pXfbQueryData = nullptr; - if (m_queryType == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT) + if ((m_queryType == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT) || + (m_queryType == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT)) { queryDataStride = sizeof(uint64_t) * numXfbQueryDataElems; queryDataSize = queryDataStride * queryCount; @@ -300,8 +297,14 @@ VkResult PalQueryPool::GetResults( if (result == VK_SUCCESS) { + Pal::QueryResultFlags palFlags = VkToPalQueryResultFlags(queryFlags); + if (m_queryType == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT) + { + palFlags = static_cast( + static_cast(palFlags) | static_cast(Pal::QueryResultOnlyPrimNeeded)); + } Pal::Result palResult = m_pPalQueryPool[DefaultDeviceIndex]->GetResults( - VkToPalQueryResultFlags(queryFlags), + palFlags, m_palQueryType, startQuery, queryCount, @@ -313,7 +316,8 @@ VkResult PalQueryPool::GetResults( result = PalToVkResult(palResult); } - if ((m_queryType == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT) && + if (((m_queryType == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT) || + (m_queryType == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT)) && ((result == VK_SUCCESS) || (result == VK_NOT_READY))) { stride = (stride == 0) ? queryDataStride : stride; @@ -327,32 +331,68 @@ VkResult PalQueryPool::GetResults( { uint32_t* pPrimitivesCount = static_cast(pData); - if ((result == VK_SUCCESS) || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) + switch (m_queryType) { - pPrimitivesCount[0] = static_cast(pXfbQueryElem[1]); - pPrimitivesCount[1] = static_cast(pXfbQueryElem[0]); + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + if ((result == VK_SUCCESS) || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) + { + pPrimitivesCount[0] = static_cast(pXfbQueryElem[1]); + pPrimitivesCount[1] = static_cast(pXfbQueryElem[0]); + } + pPrimitivesCount += 2; + pXfbQueryElem += 2; + break; + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + if ((result == VK_SUCCESS) || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) + { + pPrimitivesCount[0] = static_cast(pXfbQueryElem[0]); + } + pPrimitivesCount++; + pXfbQueryElem++; + break; + default: + VK_NEVER_CALLED(); + break; } if (availability) { // Set the availability state to the last slot. - pPrimitivesCount[2] = static_cast(pXfbQueryElem[2]); + *pPrimitivesCount = static_cast(*pXfbQueryElem); } } else { uint64_t* pPrimitivesCount = static_cast(pData); - if ((result == VK_SUCCESS) || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) + switch (m_queryType) { - pPrimitivesCount[0] = pXfbQueryElem[1]; - pPrimitivesCount[1] = pXfbQueryElem[0]; + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + if ((result == VK_SUCCESS) || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) + { + pPrimitivesCount[0] = pXfbQueryElem[1]; + pPrimitivesCount[1] = pXfbQueryElem[0]; + } + pPrimitivesCount += 2; + pXfbQueryElem += 2; + break; + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + if ((result == VK_SUCCESS) || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) + { + pPrimitivesCount[0] = pXfbQueryElem[0]; + } + pPrimitivesCount++; + pXfbQueryElem++; + break; + default: + VK_NEVER_CALLED(); + break; } if (availability) { // Set the availability state to the last slot. - pPrimitivesCount[2] = pXfbQueryElem[2]; + *pPrimitivesCount = *pXfbQueryElem; } } diff --git a/icd/api/vk_queue.cpp b/icd/api/vk_queue.cpp index cf4e768d..c7614fd7 100644 --- a/icd/api/vk_queue.cpp +++ b/icd/api/vk_queue.cpp @@ -1553,7 +1553,7 @@ VkResult Queue::Submit( } } - DebugPrintf::PostQueueSubmit(m_pDevice, pCmdBuffers, cmdBufferCount); + DebugPrintf::PostQueueSubmit(m_pDevice, this, pCmdBuffers, cmdBufferCount); #if VKI_RAY_TRACING #endif diff --git a/icd/api/vk_shader.cpp b/icd/api/vk_shader.cpp index 60fd3fc2..e32efe15 100644 --- a/icd/api/vk_shader.cpp +++ b/icd/api/vk_shader.cpp @@ -153,10 +153,13 @@ void* ShaderModule::GetFirstValidShaderData(const ShaderModuleHandle* pHandle) // ===================================================================================================================== ShaderModule::ShaderModule( size_t codeSize, - const void* pCode) + const void* pCode, + VkShaderModuleCreateFlags flags) + : + m_codeSize(codeSize), + m_pCode(pCode), + m_flags(flags) { - m_codeSize = codeSize; - m_pCode = pCode; m_codeHash = BuildCodeHash(pCode, codeSize); memset(&m_handle, 0, sizeof(m_handle)); @@ -182,10 +185,10 @@ VkResult ShaderModule::Create( memcpy(pCode, pCreateInfo->pCode, pCreateInfo->codeSize); - VK_PLACEMENT_NEW(pMemory) ShaderModule(pCreateInfo->codeSize, pCode); + VK_PLACEMENT_NEW(pMemory) ShaderModule(pCreateInfo->codeSize, pCode, pCreateInfo->flags); ShaderModule* pShaderModuleObj = static_cast(pMemory); - VkResult vkResult = pShaderModuleObj->Init(pDevice, pCreateInfo->flags); + VkResult vkResult = pShaderModuleObj->Init(pDevice); VK_ASSERT(vkResult == VK_SUCCESS); *pShaderModule = ShaderModule::HandleFromVoidPointer(pMemory); @@ -196,16 +199,25 @@ VkResult ShaderModule::Create( // ===================================================================================================================== // Initialize shader module object, performing SPIR-V to AMD IL shader binary conversion. VkResult ShaderModule::Init( - Device* pDevice, - VkShaderModuleCreateFlags flags) + Device* pDevice) { PipelineCompiler* pCompiler = pDevice->GetCompiler(DefaultDeviceIndex); + const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + Vkgc::BinaryData shaderBinary = {}; shaderBinary.pCode = m_pCode; shaderBinary.codeSize = m_codeSize; VkResult result = pCompiler->BuildShaderModule( - pDevice, flags, 0, shaderBinary, false, false, nullptr, nullptr, &m_handle); + pDevice, + m_flags, + 0, + shaderBinary, + false, + false, + nullptr, + nullptr, + &m_handle); if (result == VK_SUCCESS) { diff --git a/icd/api/vk_swapchain.cpp b/icd/api/vk_swapchain.cpp index 8a682ead..a9724851 100644 --- a/icd/api/vk_swapchain.cpp +++ b/icd/api/vk_swapchain.cpp @@ -814,9 +814,7 @@ void SwapChain::PostPresent( { if (m_pFullscreenMgr != nullptr) { - m_pFullscreenMgr->PostPresent(this, presentInfo, pPresentResult); - } m_appOwnedImageCount--; @@ -900,9 +898,7 @@ Pal::IGpuMemory* SwapChain::UpdatePresentInfo( // information in case it has enabled fullscreen. if (m_pFullscreenMgr != nullptr) { - m_pFullscreenMgr->UpdatePresentInfo(this, pPresentInfo, flipFlags); - } return pSrcImageGpuMemory; diff --git a/icd/res/ver.h b/icd/res/ver.h index a0a414cf..dd788be2 100644 --- a/icd/res/ver.h +++ b/icd/res/ver.h @@ -36,7 +36,7 @@ #define VERSION_MAJOR_STR MAKE_VERSION_STRING(VULKAN_ICD_MAJOR_VERSION) "\0" // Bump up after each promotion to mainline -#define VULKAN_ICD_BUILD_VERSION 288 +#define VULKAN_ICD_BUILD_VERSION 291 // String version is needed with leading zeros and extra termination (unicode) #define VERSION_NUMBER_MINOR VULKAN_ICD_BUILD_VERSION diff --git a/icd/settings/settings.cpp b/icd/settings/settings.cpp index 3f815833..cf7ce6a7 100644 --- a/icd/settings/settings.cpp +++ b/icd/settings/settings.cpp @@ -214,6 +214,18 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( { m_settings.maxUnifiedNonRayGenShaders = static_cast(atoi(pMaxInlinedShadersEnvVar)); } +#if VKI_BUILD_GFX11 + // Default optimized RT settings for Navi31 / 32, + // which has physical VGPR 1536 per SIMD + if (pInfo->gfxipProperties.shaderCore.vgprsPerSimd == 1536) + { + // 1.2% faster - Corresponds to 1.5x VGPR feature + m_settings.rtIndirectVgprLimit = 120; + + // 1% faster using indirectCallTargetOccupancyPerSimd of 0.75 + m_settings.indirectCallTargetOccupancyPerSimd = 0.75; + } +#endif #endif if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp10_3) @@ -229,6 +241,10 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( { m_settings.disableImplicitInvariantExports = false; + +#if VKI_BUILD_GFX11 + m_settings.optimizeTessFactor = true; +#endif } #if VKI_BUILD_GFX11 @@ -249,6 +265,7 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( if (pInfo->gpuMemoryProperties.barSize > (7ull * _1GB)) { if ((appProfile != AppProfile::WorldWarZ) + && (appProfile != AppProfile::XPlane) ) { m_settings.cmdAllocatorDataHeap = Pal::GpuHeapLocal; @@ -778,7 +795,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( // Force exclusive sharing mode - 2% gain m_settings.forceImageSharingMode = ForceImageSharingMode::ForceImageSharingModeExclusive; - m_settings.delayFullScreenAcquireToFirstPresent = true; m_settings.implicitExternalSynchronization = false; if (pInfo->gfxLevel >= Pal::GfxIpLevel::GfxIp10_3) @@ -818,8 +834,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( } m_settings.implicitExternalSynchronization = false; - - m_settings.syncOsHdrState = false; } if (appProfile == AppProfile::BaldursGate3) @@ -865,6 +879,10 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( } #if VKI_BUILD_GFX11 + if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) + { + m_settings.forcePwsMode = PwsMode::NoLateAcquirePoint; + } #endif } @@ -877,14 +895,10 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( { m_settings.disableDisplayDcc = DisplayableDcc::DisplayableDccDisabled; - ////////////////////////////////////////////// - // Ray Tracing Settings - m_settings.rtEnableBuildParallel = true; - - m_settings.rtEnableUpdateParallel = true; - m_settings.rtEnableTriangleSplitting = true; + m_settings.rtTriangleCompressionMode = NoTriangleCompression; + m_settings.useFlipHint = false; m_settings.forceEnableDcc = (ForceDccFor2DShaderStorage | @@ -978,10 +992,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( if (pInfo->gfxLevel >= Pal::GfxIpLevel::GfxIp10_3) { #if VKI_RAY_TRACING - // Ray Tracing Settings - m_settings.rtEnableBuildParallel = true; - m_settings.rtEnableUpdateParallel = true; - m_settings.rtBvhBuildModeFastTrace = BvhBuildModeLinear; m_settings.rtEnableTopDownBuild = false; m_settings.plocRadius = 4; @@ -1271,8 +1281,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( m_settings.enableAceShaderPrefetch = false; #if VKI_RAY_TRACING - m_settings.rtEnableBuildParallel = true; - m_settings.rtEnableUpdateParallel = true; m_settings.plocRadius = 4; m_settings.rtBvhBuildModeFastTrace = BvhBuildModeLinear; m_settings.rtEnableTopDownBuild = false; @@ -1282,18 +1290,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( m_settings.asyncComputeQueueMaxWavesPerCu = 20; m_settings.csWaveSize = 64; } -#if VKI_BUILD_GFX11 - else if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp11_0) - { - m_settings.rtIndirectVgprLimit = 120; -#if VKI_BUILD_NAVI31 - if (pInfo->revision == Pal::AsicRevision::Navi31) - { - m_settings.indirectCallTargetOccupancyPerSimd = 0.75; - } -#endif - } -#endif #endif } @@ -1501,8 +1497,11 @@ void VulkanSettingsLoader::ValidateSettings() m_settings.bvhBuildModeOverrideTLAS = buildMode; } -#if VKI_RAY_TRACING -#endif + // Compression is not compatible with collapse or triangle splitting. + if (m_settings.rtEnableBVHCollapse || m_settings.rtEnableTriangleSplitting) + { + m_settings.rtTriangleCompressionMode = NoTriangleCompression; + } // Clamp target occupancy to [0.0, 1.0] m_settings.indirectCallTargetOccupancyPerSimd = @@ -1570,7 +1569,9 @@ void VulkanSettingsLoader::UpdatePalSettings() // For vulkan driver, forceDepthClampBasedOnZExport should be false by default, this is required to pass // depth_range_unrestricted CTS tests. Set it to true for applications that have perf drops pPalSettings->depthClampBasedOnZExport = m_settings.forceDepthClampBasedOnZExport; + pPalSettings->cpDmaCmdCopyMemoryMaxBytes = m_settings.cpDmaCmdCopyMemoryMaxBytes; + pPalSettings->cmdBufBatchedSubmitChainLimit = m_settings.cmdBufBatchedSubmitChainLimit; // The color cache fetch size is limited to 256Bytes MAX regardless of other register settings. pPalSettings->limitCbFetch256B = m_settings.limitCbFetch256B; diff --git a/icd/settings/settings_xgl.json b/icd/settings/settings_xgl.json index b99fd6ab..e8e20a73 100644 --- a/icd/settings/settings_xgl.json +++ b/icd/settings/settings_xgl.json @@ -1119,30 +1119,6 @@ "Scope": "Driver", "Type": "bool" }, - { - "Name": "DelayFullScreenAcquireToFirstPresent", - "Description": "When true, delays acquiring exclusive full screen to the first present. May be needed in cases where the OS state gets messed up because of early acquires by the application", - "Tags": [ - "Present" - ], - "Defaults": { - "Default": false - }, - "Scope": "Driver", - "Type": "bool" - }, - { - "Name": "SyncOsHdrState", - "Description": "When true, create a dummy FP16 swapchain to sync the OS HDR state and avoid dim desktop issues during Alt+Tab and app exit from HDR modes", - "Tags": [ - "Present" - ], - "Defaults": { - "Default": true - }, - "Scope": "Driver", - "Type": "bool" - }, { "Name": "ForceMinImageCount", "Description": "A non-zero value will force to set the minimum count of swap chain images", @@ -1302,7 +1278,7 @@ "SPIRV Options" ], "Defaults": { - "Default": false + "Default": true }, "Scope": "Driver", "Type": "bool" @@ -2197,7 +2173,7 @@ "SPIRV Options" ], "Defaults": { - "Default": true + "Default": false }, "Scope": "Driver", "Type": "bool", @@ -3016,7 +2992,7 @@ "VKI_RAY_TRACING" ], "Defaults": { - "Default": "BoxSortingClosest" + "Default": "BoxSortingLargestFirstAndClosestMidpoint" }, "ValidValues": { "IsEnum": true, @@ -3325,7 +3301,7 @@ "VKI_RAY_TRACING" ], "Defaults": { - "Default": "TriangleCompressionAutoModeDisabled" + "Default": "TriangleCompressionAutoModeFastTraceOrCompaction" }, "ValidValues": { "IsEnum": true, @@ -3605,7 +3581,7 @@ "VKI_RAY_TRACING" ], "Defaults": { - "Default": "NoTriangleCompression" + "Default": "AutoTriangleCompression" }, "ValidValues": { "IsEnum": true, @@ -7005,6 +6981,18 @@ "Type": "uint32", "Name": "CpDmaCmdCopyMemoryMaxBytes" }, + { + "Description": "Limits the number of command buffers that will be chained together; reduce to prevent problems due to long running submits or to minimize the scope of commands being debugged.", + "Tags": [ + "Debugging" + ], + "Defaults": { + "Default": 128 + }, + "Scope": "Driver", + "Type": "uint32", + "Name": "CmdBufBatchedSubmitChainLimit" + }, { "Description": "Disables all implicit invariant marking of exports, which in turn disables MUL/ADD -> FMA. This option is legal but may cause issues if applications are sensitive to FMA influencing some export results.", "Tags": [