diff --git a/cmake/XglCompileDefinitions.cmake b/cmake/XglCompileDefinitions.cmake index f015f753..8ddc2268 100644 --- a/cmake/XglCompileDefinitions.cmake +++ b/cmake/XglCompileDefinitions.cmake @@ -83,6 +83,10 @@ macro(xgl_set_compile_definitions) target_compile_definitions(xgl PRIVATE VKI_BUILD_PHOENIX1=1) endif() + if(XGL_BUILD_PHOENIX2) + target_compile_definitions(xgl PRIVATE VKI_BUILD_PHOENIX2=1) + endif() + if(XGL_BUILD_REMBRANDT) target_compile_definitions(xgl PRIVATE VKI_BUILD_REMBRANDT=1) endif() diff --git a/cmake/XglOptions.cmake b/cmake/XglOptions.cmake index 7fb621aa..9fd74fbf 100644 --- a/cmake/XglOptions.cmake +++ b/cmake/XglOptions.cmake @@ -63,6 +63,8 @@ macro(xgl_options) option(XGL_BUILD_PHOENIX1 "Build vulkan for PHOENIX1" ON) + option(XGL_BUILD_PHOENIX2 "Build vulkan for PHOENIX2" ON) + option(XGL_BUILD_TESTS "Build tests?" OFF) option(XGL_BUILD_TOOLS "Build tools?" OFF) diff --git a/cmake/XglOverrides.cmake b/cmake/XglOverrides.cmake index d0b4a87f..70bddd86 100644 --- a/cmake/XglOverrides.cmake +++ b/cmake/XglOverrides.cmake @@ -107,6 +107,8 @@ macro(xgl_overrides_pal) set(PAL_BUILD_GFX11 1 CACHE BOOL "${PROJECT_NAME} override." FORCE) endif() + set(PAL_BUILD_PHOENIX2 ${XGL_BUILD_PHOENIX2} CACHE BOOL "${PROJECT_NAME} override." FORCE) + # Wayland set(PAL_BUILD_WAYLAND ${BUILD_WAYLAND_SUPPORT} CACHE BOOL "Build PAL with Wayland support" FORCE) @@ -157,6 +159,8 @@ macro(xgl_overrides_vkgc) set(LLPC_BUILD_PHOENIX1 ${XGL_BUILD_PHOENIX1} CACHE BOOL "${PROJECT_NAME} override." FORCE) + set(LLPC_BUILD_PHOENIX2 ${XGL_BUILD_PHOENIX2} CACHE BOOL "${PROJECT_NAME} override." FORCE) + set(LLPC_ENABLE_WERROR ${ICD_ANALYSIS_WARNINGS_AS_ERRORS} CACHE BOOL "${PROJECT_NAME} override." FORCE) endmacro() diff --git a/cmake/XglVersions.cmake b/cmake/XglVersions.cmake index ef0908bd..02029eeb 100644 --- a/cmake/XglVersions.cmake +++ b/cmake/XglVersions.cmake @@ -28,7 +28,7 @@ include_guard() # This will become the value of PAL_CLIENT_INTERFACE_MAJOR_VERSION. It describes the version of the PAL interface # that the ICD supports. PAL uses this value to enable backwards-compatibility for older interface versions. # It must be updated on each PAL promotion after handling all of the interface changes described in palLib.h. -set(ICD_PAL_CLIENT_MAJOR_VERSION "856") +set(ICD_PAL_CLIENT_MAJOR_VERSION "867") # This will become the value of GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION if ICD_GPUOPEN_DEVMODE_BUILD=1. # It describes the interface version of the gpuopen shared module (part of PAL) that the ICD supports. @@ -42,4 +42,4 @@ set(ICD_GPURT_CLIENT_MAJOR_VERSION "46") # This will become the value of LLPC_CLIENT_INTERFACE_MAJOR_VERSION if ICD_BUILD_LLPC=1. # It describes the version of the interface version of LLPC that the ICD supports. 
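Annotation: the new XGL_BUILD_PHOENIX2 option above flows three ways — a VKI_BUILD_PHOENIX2=1 compile definition on the xgl target, plus forced PAL_BUILD_PHOENIX2/LLPC_BUILD_PHOENIX2 cache overrides for the PAL and LLPC subprojects. As a hedged illustration of how such a definition is typically consumed in driver code, the sketch below guards an ASIC-specific path behind the macro; the enum and function names are hypothetical, not part of this change:

```cpp
// Hypothetical illustration: VKI_BUILD_PHOENIX2=1 is defined by CMake when
// XGL_BUILD_PHOENIX2 is ON, so ASIC-specific paths compile away otherwise.
#include <cstdio>

enum class AsicFamily { Generic, Phoenix1, Phoenix2 };

static const char* AsicName(AsicFamily family)
{
    switch (family)
    {
#if VKI_BUILD_PHOENIX2
    case AsicFamily::Phoenix2: return "PHOENIX2"; // only built when the option is ON
#endif
    case AsicFamily::Phoenix1: return "PHOENIX1";
    default:                   return "generic";
    }
}

int main()
{
    std::printf("%s\n", AsicName(AsicFamily::Phoenix1));
    return 0;
}
```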
-set(ICD_LLPC_CLIENT_MAJOR_VERSION "70") +set(ICD_LLPC_CLIENT_MAJOR_VERSION "71") diff --git a/icd/CMakeLists.txt b/icd/CMakeLists.txt index e9d1d557..2da6c892 100644 --- a/icd/CMakeLists.txt +++ b/icd/CMakeLists.txt @@ -150,6 +150,7 @@ target_sources(xgl PRIVATE api/vk_gpa_session.cpp api/vk_descriptor_update_template.cpp api/vk_utils.cpp + api/vk_indirect_commands_layout.cpp api/appopt/barrier_filter_layer.cpp api/appopt/strange_brigade_layer.cpp api/appopt/g_shader_profile.cpp @@ -302,6 +303,34 @@ target_sources(xgl PRIVATE settings/settings_xgl.json ) +add_custom_command( + OUTPUT ${ICD_SETTINGS_DIR}/g_experiments.cpp ${ICD_SETTINGS_DIR}/g_experiments.h + COMMAND ${PYTHON_CMD} ${ICD_GEN_SETTINGS} + -i ${ICD_SETTINGS_DIR}/experiments_settings_xgl.json + -o ${ICD_SETTINGS_DIR} + -g experiments + -s settings/experimentsLoader.h + --namespaces vk + --settings-struct-name ExpSettings + --classname ExperimentsLoader + DEPENDS ${ICD_GEN_SETTINGS_FILES} ${ICD_SETTINGS_DIR}/experiments_settings_xgl.json + COMMENT "Generating Vulkan settings code from experiments_settings_xgl.json" +) + +add_custom_target( + RunVKExperimentsGenerator + DEPENDS ${ICD_GEN_SETTINGS_FILES} ${ICD_SETTINGS_DIR}/experiments_settings_xgl.json + COMMENT "Checking if re-generation is required for settings" +) + +add_dependencies(xgl RunVKExperimentsGenerator) + +target_sources(xgl PRIVATE + settings/g_experiments.cpp + settings/experimentsLoader.cpp + settings/experiments_settings_xgl.json +) + ### ICD api/sqtt ############################################################## target_sources(xgl PRIVATE api/sqtt/sqtt_layer.cpp @@ -311,7 +340,10 @@ target_sources(xgl PRIVATE ### ICD api/devmode ########################################################### if(ICD_GPUOPEN_DEVMODE_BUILD) - target_sources(xgl PRIVATE api/devmode/devmode_mgr.cpp) + target_sources(xgl PRIVATE + api/devmode/devmode_rgp.cpp + api/devmode/devmode_ubertrace.cpp + ) endif() ### ICD layer ################################################################## diff --git a/icd/Loader/LunarG/Lnx/amd-icd.json b/icd/Loader/LunarG/Lnx/amd-icd.json index 7e32c74a..c6aed269 100644 --- a/icd/Loader/LunarG/Lnx/amd-icd.json +++ b/icd/Loader/LunarG/Lnx/amd-icd.json @@ -2,13 +2,13 @@ "file_format_version": "1.0.0", "ICD": { "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so", - "api_version": "1.3.279" + "api_version": "1.3.280" }, "layer": { "name": "VK_LAYER_AMD_switchable_graphics_@ISABITS@", "type": "GLOBAL", "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so", - "api_version": "1.3.279", + "api_version": "1.3.280", "implementation_version": "1", "description": "AMD switchable graphics layer", "functions": { diff --git a/icd/api/app_profile.cpp b/icd/api/app_profile.cpp index 17df92c0..cf466325 100644 --- a/icd/api/app_profile.cpp +++ b/icd/api/app_profile.cpp @@ -40,6 +40,7 @@ #if defined(__unix__) #include +#include #include #endif @@ -731,18 +732,6 @@ constexpr AppProfilePatternEntry AppEngineQuanticDream "quantic dream engine" }; -constexpr AppProfilePatternEntry AppNameEnshrouded = -{ - PatternAppNameLower, - "enshrouded" -}; - -constexpr AppProfilePatternEntry AppEngineHolistic = -{ - PatternEngineNameLower, - "holistic" -}; - constexpr AppProfilePatternEntry PatternEnd = {}; // This is a table of patterns. The first matching pattern in this table will be returned. 
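Annotation: the AppPatternTable edited in the next hunk is searched top-down, and the first pattern whose entries all match the lower-cased application and engine names wins; this change simply deletes the Enshrouded/Holistic rows. Below is a simplified first-match lookup under that model — the types and the "mesa zink" string are illustrative stand-ins, not the driver's real definitions:

```cpp
#include <cstddef>
#include <cstdio>
#include <cstring>

// Simplified stand-ins for the driver's pattern types.
struct PatternEntry { const char* key; const char* value; }; // key: "app" or "engine"
struct Pattern      { const char* profile; PatternEntry entries[3]; };

// Returns the profile of the first pattern whose entries all match.
static const char* MatchProfile(const Pattern* table, std::size_t count,
                                const char* appName, const char* engineName)
{
    for (std::size_t i = 0; i < count; ++i)
    {
        bool matches = true;
        for (const PatternEntry& e : table[i].entries)
        {
            if (e.key == nullptr) break; // equivalent of the PatternEnd sentinel
            const char* candidate = (std::strcmp(e.key, "app") == 0) ? appName : engineName;
            if (std::strcmp(candidate, e.value) != 0) { matches = false; break; }
        }
        if (matches) return table[i].profile;
    }
    return "Default";
}

int main()
{
    const Pattern table[] = {
        { "Zink", { { "engine", "mesa zink" }, {} } }, // hypothetical row
    };
    std::printf("%s\n", MatchProfile(table, 1, "someapp", "mesa zink"));
    return 0;
}
```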
@@ -1477,23 +1466,6 @@ AppProfilePattern AppPatternTable[] = } }, - { - AppProfile::Enshrouded, - { - AppNameEnshrouded, - AppEngineHolistic, - PatternEnd - } - }, - - { - AppProfile::HolisticEngine, - { - AppEngineHolistic, - PatternEnd - } - }, - { AppProfile::Zink, { @@ -1702,24 +1674,14 @@ static char* GetExecutableName( size_t* pLength, bool includeExtension) // true if you want the extension on the file name. { - pid_t pid = getpid(); char* pExecutable = nullptr; - char* pModuleFileName = nullptr; char path[PATH_MAX] = {0}; - char commandStringBuffer[PATH_MAX] = {0}; - sprintf(commandStringBuffer, "cat /proc/%d/cmdline", pid); - FILE* pCommand = popen(commandStringBuffer, "r"); - if (pCommand != nullptr) + pExecutable = static_cast(malloc(PATH_MAX)); + + if (pExecutable != nullptr) { - if (fgets(path, PATH_MAX, pCommand) != nullptr) - { - pExecutable = static_cast(malloc(PATH_MAX)); - pModuleFileName = strrchr(path, '/') ? strrchr(path, '/') + 1 : path; - pModuleFileName = strrchr(pModuleFileName, '\\') ? strrchr(pModuleFileName, '\\') + 1 : pModuleFileName; - strcpy(pExecutable, pModuleFileName); - *pLength = strlen(pExecutable); - } - pclose(pCommand); + utils::GetExecutableNameAndPath(pExecutable, &path[0]); + *pLength = strlen(pExecutable); } return pExecutable; } diff --git a/icd/api/app_shader_optimizer.cpp b/icd/api/app_shader_optimizer.cpp index 6988ec7e..475b230f 100644 --- a/icd/api/app_shader_optimizer.cpp +++ b/icd/api/app_shader_optimizer.cpp @@ -592,14 +592,20 @@ void ShaderOptimizer::ApplyProfileToDynamicComputeShaderInfo( } // ===================================================================================================================== -void ShaderOptimizer::ApplyProfileToDynamicGraphicsShaderInfo( +bool ShaderOptimizer::ApplyProfileToDynamicGraphicsShaderInfo( const ShaderProfileAction& action, Pal::DynamicGraphicsShaderInfo* pGraphicsShaderInfo) const { + bool hasUpdate = false; + if (action.dynamicShaderInfo.apply.maxWavesPerCu) { pGraphicsShaderInfo->maxWavesPerCu = static_cast(action.dynamicShaderInfo.maxWavesPerCu); + + hasUpdate = true; } + + return hasUpdate; } // ===================================================================================================================== @@ -631,25 +637,32 @@ void ShaderOptimizer::ApplyProfileToGraphicsPipelineCreateInfo( switch (vkgcStage) { case ShaderStage::ShaderStageTask: - ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->ts); + pGraphicsShaderInfos->enable.ts |= + ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->ts); break; case ShaderStage::ShaderStageVertex: - ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->vs); + pGraphicsShaderInfos->enable.vs |= + ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->vs); break; case ShaderStage::ShaderStageTessControl: - ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->hs); + pGraphicsShaderInfos->enable.hs |= + ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->hs); break; case ShaderStage::ShaderStageTessEval: - ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->ds); + pGraphicsShaderInfos->enable.ds |= + ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->ds); break; case ShaderStage::ShaderStageGeometry: - ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->gs); + 
pGraphicsShaderInfos->enable.gs |= + ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->gs); break; case ShaderStage::ShaderStageMesh: - ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->ms); + pGraphicsShaderInfos->enable.ms |= + ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->ms); break; case ShaderStage::ShaderStageFragment: - ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->ps); + pGraphicsShaderInfos->enable.ps |= + ApplyProfileToDynamicGraphicsShaderInfo(shaders[vkgcStage], &pGraphicsShaderInfos->ps); break; default: PAL_ASSERT_ALWAYS(); diff --git a/icd/api/cmd_buffer_ring.cpp b/icd/api/cmd_buffer_ring.cpp index 63a96f0a..40bf1556 100644 --- a/icd/api/cmd_buffer_ring.cpp +++ b/icd/api/cmd_buffer_ring.cpp @@ -188,7 +188,7 @@ void CmdBufferRing::DestroyCmdBufState( // Wait to finish in case still in flight if (pCmdBufState->pFence->GetStatus() == Pal::Result::NotReady) { - pDevice->PalDevice(deviceIdx)->WaitForFences(1, &pCmdBufState->pFence, true, ~0ULL); + pDevice->PalDevice(deviceIdx)->WaitForFences(1, &pCmdBufState->pFence, true, std::chrono::nanoseconds::max()); } // Destroy Fence diff --git a/icd/api/compiler_solution.cpp b/icd/api/compiler_solution.cpp index d20068c3..3942a4a1 100644 --- a/icd/api/compiler_solution.cpp +++ b/icd/api/compiler_solution.cpp @@ -206,7 +206,7 @@ void CompilerSolution::StoreShaderBinaryToCache( if (updateBinaryCache || updateAppCache || (pCacheBinary->pCode == nullptr)) { - if ((pHeader->binaryLength > 0) && (pCacheBinary->codeSize == 0)) + if (((pHeader->binaryLength > 0) || (pHeader->requireFullPipeline)) && (pCacheBinary->codeSize == 0)) { size_t cacheSize = sizeof(ShaderLibraryBlobHeader) + pHeader->binaryLength + pHeader->fragMetaLength; @@ -218,7 +218,10 @@ void CompilerSolution::StoreShaderBinaryToCache( if (pBuffer != nullptr) { memcpy(pBuffer, pHeader, sizeof(ShaderLibraryBlobHeader)); - memcpy(Util::VoidPtrInc(pBuffer, sizeof(ShaderLibraryBlobHeader)), pBlob, pHeader->binaryLength); + if (pBlob != nullptr) + { + memcpy(Util::VoidPtrInc(pBuffer, sizeof(ShaderLibraryBlobHeader)), pBlob, pHeader->binaryLength); + } if (pFragmentMeta != nullptr) { memcpy(Util::VoidPtrInc(pBuffer, sizeof(ShaderLibraryBlobHeader) + pHeader->binaryLength), diff --git a/icd/api/compiler_solution_llpc.cpp b/icd/api/compiler_solution_llpc.cpp index 5cb39cc5..15be0b29 100644 --- a/icd/api/compiler_solution_llpc.cpp +++ b/icd/api/compiler_solution_llpc.cpp @@ -507,7 +507,22 @@ VkResult CompilerSolutionLlpc::CreateGraphicsShaderBinary( } } - if (hitCache == false) + bool checkShaderModuleIdUsage = false; + if (hitCache) + { + const auto* pShaderLibraryHeader = + reinterpret_cast(shaderLibraryBinary.pCode); + if (pShaderLibraryHeader->requireFullPipeline) + { + checkShaderModuleIdUsage = true; + } + } + else + { + checkShaderModuleIdUsage = true; + } + + if (checkShaderModuleIdUsage) { for (uint32_t stage = 0; stage < ShaderStage::ShaderStageGfxCount; stage++) { @@ -552,10 +567,14 @@ VkResult CompilerSolutionLlpc::CreateGraphicsShaderBinary( if (llpcResult == Vkgc::Result::Success) { - blobHeader.binaryLength = finalBinary.codeSize; + blobHeader.binaryLength = finalBinary.codeSize; blobHeader.fragMetaLength = pipelineOut.fsOutputMetaDataSize; } - else if (llpcResult != Vkgc::Result::RequireFullPipeline) + else if (llpcResult == Vkgc::Result::RequireFullPipeline) + { + blobHeader.requireFullPipeline = true; + } + else { result = (llpcResult 
== Vkgc::Result::ErrorOutOfMemory) ? VK_ERROR_OUT_OF_HOST_MEMORY : VK_ERROR_INITIALIZATION_FAILED; @@ -567,18 +586,15 @@ VkResult CompilerSolutionLlpc::CreateGraphicsShaderBinary( // Always call StoreShaderBinaryToCache to sync data between app cache and binary cache except // RequireFullPipeline. When cache is hit, blobHeader is zero, StoreShaderBinaryToCache will ignore // finalBinary, and reuse shaderLibraryBinary. - if ((finalBinary.pCode != nullptr) || (shaderLibraryBinary.pCode != nullptr)) - { - StoreShaderBinaryToCache( - pPipelineCache, - &cacheId, - &blobHeader, - finalBinary.pCode, - pipelineOut.fsOutputMetaData, - hitCache, - hitAppCache, - &shaderLibraryBinary); - } + StoreShaderBinaryToCache( + pPipelineCache, + &cacheId, + &blobHeader, + finalBinary.pCode, + pipelineOut.fsOutputMetaData, + hitCache, + hitAppCache, + &shaderLibraryBinary); pModuleState->elfPackage = shaderLibraryBinary; pModuleState->pFsOutputMetaData = nullptr; @@ -830,7 +846,7 @@ void LlpcHelperThreadProvider::WaitForTasks() { while (m_pDeferredWorkload->completedInstances < m_pDeferredWorkload->totalInstances) { - m_pDeferredWorkload->event.Wait(1.0f); + m_pDeferredWorkload->event.Wait(Util::fseconds { 1.0f }); } } @@ -1229,8 +1245,11 @@ Vkgc::BinaryData CompilerSolutionLlpc::ExtractPalElfBinary( { Vkgc::BinaryData elfBinary = {}; const ShaderLibraryBlobHeader* pHeader = reinterpret_cast(shaderBinary.pCode); - elfBinary.pCode = pHeader + 1; - elfBinary.codeSize = pHeader->binaryLength; + if (pHeader->binaryLength > 0) + { + elfBinary.pCode = pHeader + 1; + elfBinary.codeSize = pHeader->binaryLength; + } return elfBinary; } diff --git a/icd/api/debug_printf.cpp b/icd/api/debug_printf.cpp index 1517112a..6fd013ac 100644 --- a/icd/api/debug_printf.cpp +++ b/icd/api/debug_printf.cpp @@ -214,7 +214,7 @@ Pal::Result DebugPrintf::PostQueueProcess( while (true) { palResult = pDevice->PalDevice(DefaultDeviceIndex)->WaitForSemaphores( - 1, palSemaphores, waitValues, 0, 1000000llu); + 1, palSemaphores, waitValues, 0, std::chrono::nanoseconds {1000000llu}); decodeOffset = ProcessDebugPrintfBuffer(pDevice, deviceIdx, decodeOffset, &file); if ((PalToVkResult(palResult) <= 0) || (loopIndex++ > 1000)) diff --git a/icd/api/devmode/devmode_mgr.h b/icd/api/devmode/devmode_mgr.h index 0a99d53e..61d7ce80 100644 --- a/icd/api/devmode/devmode_mgr.h +++ b/icd/api/devmode/devmode_mgr.h @@ -25,7 +25,7 @@ /** *********************************************************************************************************************** * @file devmode_mgr.h -* @brief Contains the GPU Open Developer Mode manager (DevModeMgr) +* @brief Contains the GPU Open Developer Mode interface (IDevMode) *********************************************************************************************************************** */ @@ -39,60 +39,12 @@ #include "include/vk_device.h" // PAL headers -#include "palHashMap.h" #include "palQueue.h" -#include "palUtil.h" -#include "palList.h" -#include "palVector.h" - -// gpuutil headers -#include "gpuUtil/palGpaSession.h" -#if ICD_GPUOPEN_DEVMODE_BUILD -// gpuopen headers -#include "gpuopen.h" - -#endif - -// PAL forward declarations -namespace Pal -{ -class ICmdBuffer; -class IFence; -class IQueueSemaphore; -struct PalPublicSettings; -} - -// GPUOpen forward declarations -namespace DevDriver -{ -class DevDriverServer; -class PipelineUriService; -class IMsgChannel; -struct MessageBuffer; - -namespace DriverControlProtocol -{ -enum struct DeviceClockMode : uint32_t; -class HandlerServer; -} - -namespace 
SettingsProtocol -{ -class HandlerServer; -struct Setting; -} - -namespace RGPProtocol -{ -class RGPServer; -} -} // Vulkan forward declarations namespace vk { class Instance; -class Queue; class Pipeline; #if VKI_RAY_TRACING class RayTracingPipeline; @@ -107,26 +59,18 @@ namespace vk // ===================================================================================================================== // This class provides functionality to interact with the GPU Open Developer Mode message passing service and the rest // of the driver. -class DevModeMgr +class IDevMode { #if ICD_GPUOPEN_DEVMODE_BUILD public: - // Number of frames to wait before collecting a hardware trace. - // Note: This will be replaced in the future by a remotely configurable value provided by the RGP server. - static constexpr uint32_t NumTracePreparationFrames = 4; - // Pipeline hash used for instruction tracing whenever no pipeline is being targetted. static constexpr uint64_t InvalidTargetPipelineHash = 0; - ~DevModeMgr(); - - static VkResult Create(Instance* pInstance, DevModeMgr** ppObject); - - void Finalize( + virtual void Finalize( uint32_t deviceCount, - VulkanSettingsLoader* settingsLoaders[]); + VulkanSettingsLoader* settingsLoaders[]) = 0; - void Destroy(); + virtual void Destroy() = 0; enum class FrameDelimiterType : uint32_t { @@ -136,280 +80,61 @@ class DevModeMgr Count }; - void NotifyFrameBegin(const Queue* pQueue, FrameDelimiterType delimiterType); - void NotifyFrameEnd(const Queue* pQueue, FrameDelimiterType delimiterType); - void WaitForDriverResume(); - void PipelineCreated(Device* pDevice, Pipeline* pPipeline); - void PipelineDestroyed(Device* pDevice, Pipeline* pPipeline); + virtual void NotifyFrameBegin(const Queue* pQueue, FrameDelimiterType delimiterType) = 0; + virtual void NotifyFrameEnd(const Queue* pQueue, FrameDelimiterType delimiterType) = 0; + virtual void WaitForDriverResume() = 0; + virtual void PipelineCreated(Device* pDevice, Pipeline* pPipeline) = 0; + virtual void PipelineDestroyed(Device* pDevice, Pipeline* pPipeline) = 0; #if VKI_RAY_TRACING - void ShaderLibrariesCreated(Device* pDevice, RayTracingPipeline* pPipeline); - void ShaderLibrariesDestroyed(Device* pDevice, RayTracingPipeline* pPipeline); + virtual void ShaderLibrariesCreated(Device* pDevice, RayTracingPipeline* pPipeline) = 0; + virtual void ShaderLibrariesDestroyed(Device* pDevice, RayTracingPipeline* pPipeline) = 0; #endif - void PostDeviceCreate(Device* pDevice); - void PreDeviceDestroy(Device* pDevice); - void NotifyPreSubmit(); + virtual void PostDeviceCreate(Device* pDevice) = 0; + virtual void PreDeviceDestroy(Device* pDevice) = 0; + virtual void NotifyPreSubmit() = 0; - uint64_t GetInstructionTraceTargetHash(); - void StartInstructionTrace(CmdBuffer* pCmdBuffer); - void StopInstructionTrace(CmdBuffer* pCmdBuffer); + virtual uint64_t GetInstructionTraceTargetHash() = 0; + virtual void StartInstructionTrace(CmdBuffer* pCmdBuffer) = 0; + virtual void StopInstructionTrace(CmdBuffer* pCmdBuffer) = 0; - bool IsTracingEnabled() const; - bool IsCrashAnalysisEnabled() const { return m_crashAnalysisEnabled; } + virtual bool IsTracingEnabled() const = 0; + virtual bool IsCrashAnalysisEnabled() const = 0; - Pal::Result TimedQueueSubmit( + virtual Pal::Result TimedQueueSubmit( uint32_t deviceIdx, Queue* pQueue, uint32_t cmdBufferCount, const VkCommandBuffer* pCommandBuffers, const Pal::SubmitInfo& submitInfo, - VirtualStackFrame* pVirtStackFrame); + VirtualStackFrame* pVirtStackFrame) = 0; - Pal::Result 
TimedSignalQueueSemaphore( + virtual Pal::Result TimedSignalQueueSemaphore( uint32_t deviceIdx, Queue* pQueue, VkSemaphore semaphore, uint64_t value, - Pal::IQueueSemaphore* pQueueSemaphore); + Pal::IQueueSemaphore* pQueueSemaphore) = 0; - Pal::Result TimedWaitQueueSemaphore( + virtual Pal::Result TimedWaitQueueSemaphore( uint32_t deviceIdx, Queue* pQueue, VkSemaphore semaphore, uint64_t value, - Pal::IQueueSemaphore* pQueueSemaphore); + Pal::IQueueSemaphore* pQueueSemaphore) = 0; - inline bool IsQueueTimingActive(const Device* pDevice) const; - inline bool GetTraceFrameBeginTag(uint64_t* pTag) const; - inline bool GetTraceFrameEndTag(uint64_t* pTag) const; + virtual bool IsQueueTimingActive(const Device* pDevice) const = 0; + virtual bool GetTraceFrameBeginTag(uint64_t* pTag) const = 0; + virtual bool GetTraceFrameEndTag(uint64_t* pTag) const = 0; - Util::Result RegisterPipelineCache( + virtual Util::Result RegisterPipelineCache( PipelineBinaryCache* pPipelineCache, - uint32_t postSizeLimit); - - void DeregisterPipelineCache( - PipelineBinaryCache* pPipelineCache); - Util::ListIterator GetPipelineCacheListIterator() - { return m_pipelineCaches.Begin(); } - - Util::RWLock* GetPipelineReinjectionLock() - { return &m_pipelineReinjectionLock; } - -private: - // Steps that an RGP trace goes through - enum class TraceStatus : uint32_t - { - // "Pre-trace" stages: - Idle = 0, // No active trace and none requested - Pending, // We've identified that a trace has been requested and we've received its parameters, - // but we have not yet seen the first frame. - Preparing, // A trace has been requested but is not active yet because we are - // currently sampling timing information over some number of lead frames. - Running, // SQTT and queue timing is currently active for all command buffer submits. - - // "Post-trace" stages: - WaitingForSqtt, // Command to turn off SQTT has been submitted and we're waiting for fence confirmation. - Ending // Tracing is no longer active, but all results are not yet ready. - }; - - // Various trigger modes supported for RGP traces - enum class TriggerMode : uint32_t - { - Present = 0, // Traces triggered by presents - Index, // Traces triggered by frame indices - Tag // Traces triggered by command buffer tags - }; - - // Queue family (type)-specific state to support RGP tracing (part of device state) - struct TraceQueueFamilyState - { - uint32_t queueFamilyIndex; - Pal::QueueType queueType; - Pal::EngineType engineType; - Pal::ICmdBuffer* pTraceBeginCmdBuf; - Pal::ICmdBuffer* pTraceBeginSqttCmdBuf; - Pal::ICmdBuffer* pTraceEndSqttCmdBuf; - Pal::ICmdBuffer* pTraceEndCmdBuf; - Pal::ICmdBuffer* pTraceFlushCmdBuf; - bool supportsTracing; - bool usedForBegin; - bool usedForEndSqtt; - bool usedForEnd; - }; - - // Queue-specific resources to support RGP tracing (part of device state) - struct TraceQueueState - { - const Queue* pQueue; - TraceQueueFamilyState* pFamily; - Pal::uint64 queueId; - Pal::uint64 queueContext; - bool timingSupported; - }; - - static constexpr uint32_t MaxTraceQueueFamilies = Queue::MaxQueueFamilies; - static constexpr uint32_t MaxTraceQueues = MaxTraceQueueFamilies * Queue::MaxQueuesPerFamily; - - // All per-device state to support RGP tracing - struct TraceState - { - TraceStatus status; // Current trace status (idle, running, etc.) 
- bool labelDelimsPresent; // True is a label delimiter is recieved - - Device* pDevice; // The device currently doing the tracing - Pal::ICmdAllocator* pCmdAllocator; // Command allocator for creating trace-begin/end buffers - Pal::IFence* pBeginFence; // Fence that is signaled when a trace-begin cmdbuf retires - Pal::IFence* pEndSqttFence; // Fence that is signaled when a trace-end cmdbuf retires - Pal::IFence* pEndFence; // Fence that is signaled when a trace-end cmdbuf retires - TraceQueueState* pTracePrepareQueue; // The queue that triggered the full start of a trace - TraceQueueState* pTraceBeginQueue; // The queue that triggered starting SQTT - TraceQueueState* pTraceEndSqttQueue; // The queue that triggered ending SQTT - TraceQueueState* pTraceEndQueue; // The queue that triggered the full end of a trace - - GpuUtil::GpaSession* pGpaSession; // GPA session helper object for building RGP data - uint32_t gpaSampleId; // Sample ID associated with the current trace - bool queueTimingEnabled; // Queue timing is enabled - bool flushAllQueues; // Flushes all queues during the last preparation frame. - - // Queue-specific state/information for tracing: - uint32_t queueCount; - TraceQueueState queueState[MaxTraceQueues]; - uint32_t auxQueueCount; - TraceQueueState auxQueueStates[MaxTraceQueues]; // Used for queues belonging to other logical devices - // pointing to the same physical device - uint32_t queueFamilyCount; - TraceQueueFamilyState queueFamilyState[MaxTraceQueueFamilies]; - - uint32_t activeCmdBufCount; // Number of command buffers in below list - Pal::ICmdBuffer* pActiveCmdBufs[4]; // List of command buffers that need to be reset at end of trace - uint32_t preparedFrameCount; // Number of frames counted while preparing for a trace - uint32_t sqttFrameCount; // Number of frames counted while SQTT tracing is active - uint64_t frameBeginTag; // If a command buffer with this debug-tag is submitted, it is - // treated as a virtual frame-start event. - uint64_t frameEndTag; // Similarly to above but for frame-end post-submit. 
- }; - - DevModeMgr(Instance* pInstance); - - Pal::Result Init(); - - void AdvanceActiveTraceStep(TraceState* pState, const Queue* pQueue, bool beginFrame, FrameDelimiterType delimiterType); - void TraceIdleToPendingStep(TraceState* pState); - Pal::Result TracePendingToPreparingStep(TraceState* pState, const Queue* pQueue, FrameDelimiterType delimiterType); - Pal::Result TracePreparingToRunningStep(TraceState* pState, const Queue* pQueue); - Pal::Result TraceRunningToWaitingForSqttStep(TraceState* pState, const Queue* pQueue); - Pal::Result TraceWaitingForSqttToEndingStep(TraceState* pState, const Queue* pQueue); - Pal::Result TraceEndingToIdleStep(TraceState* pState); - void FinishOrAbortTrace(TraceState* pState, bool aborted); - - Pal::Result CheckTraceDeviceChanged(TraceState* pState, Device* pNewDevice); - void DestroyRGPTracing(TraceState* pState); - Pal::Result InitRGPTracing(TraceState* pState, Device* pDevice); - Pal::Result InitTraceQueueResources(TraceState* pState, bool* pHasDebugVmid, const Queue* pQueue, bool auxQueue); - Pal::Result InitTraceQueueResourcesForDevice(TraceState* pState, bool* pHasDebugVmid); - Pal::Result InitTraceQueueFamilyResources(TraceState* pTraceState, TraceQueueFamilyState* pFamilyState); - void DestroyTraceQueueFamilyResources(TraceQueueFamilyState* pState); - TraceQueueState* FindTraceQueueState(TraceState* pState, const Queue* pQueue); - bool QueueSupportsTiming(uint32_t deviceIdx, const Queue* pQueue); + uint32_t postSizeLimit) = 0; - Instance* m_pInstance; - DevDriver::DevDriverServer* m_pDevDriverServer; - DevDriver::RGPProtocol::RGPServer* m_pRGPServer; - DevDriver::PipelineUriService* m_pPipelineUriService; - Util::Mutex m_traceMutex; - TraceState m_trace; - bool m_finalized; - TriggerMode m_triggerMode; // Current trigger mode for RGP frame trace - uint32_t m_numPrepFrames; - uint32_t m_traceGpuMemLimit; - bool m_enableInstTracing; // Enable instruction-level SQTT tokens - bool m_enableSampleUpdates; - bool m_allowComputePresents; - bool m_blockingTraceEnd; // Wait on trace-end fences immediately. - uint32_t m_globalFrameIndex; - uint64_t m_traceFrameBeginTag; - uint64_t m_traceFrameEndTag; - uint32_t m_traceFrameBeginIndex; - uint32_t m_traceFrameEndIndex; - uint64_t m_targetApiPsoHash; - uint32_t m_seMask; // Shader engine mask - bool m_perfCountersEnabled; // True if perf counters are enabled - uint64_t m_perfCounterMemLimit; // Memory limit for perf counters - uint32_t m_perfCounterFrequency; // Counter sample frequency - bool m_useStaticVmid; - bool m_staticVmidActive; - bool m_crashAnalysisEnabled; - - using PerfCounterList = Util::Vector; - - PerfCounterList m_perfCounterIds; // List of perf counter ids - - using PipelineCacheList = Util::List; - - PipelineCacheList m_pipelineCaches; - Util::RWLock m_pipelineReinjectionLock; - - PAL_DISALLOW_DEFAULT_CTOR(DevModeMgr); - PAL_DISALLOW_COPY_AND_ASSIGN(DevModeMgr); + virtual void DeregisterPipelineCache( + PipelineBinaryCache* pPipelineCache) = 0; #endif }; -#if ICD_GPUOPEN_DEVMODE_BUILD -// ===================================================================================================================== -// Returns true if queue operations are currently being timed by RGP traces. 
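Annotation: the header surgery above converts the concrete DevModeMgr into the pure-virtual IDevMode — every public entry point becomes `= 0`, and the RGP trace state machine, queue bookkeeping, and DevDriver plumbing migrate into the DevModeRgp subclass introduced later in this diff. A minimal sketch of the resulting pattern, with placeholder members standing in for the real trace state:

```cpp
#include <cstdio>

// Pure-virtual interface: callers hold an IDevMode* and never see RGP state.
class IDevMode
{
public:
    virtual ~IDevMode() = default;
    virtual void NotifyPreSubmit()        = 0;
    virtual bool IsTracingEnabled() const = 0;
};

// Concrete implementation owning the trace state (stand-in for DevModeRgp).
class DevModeRgp final : public IDevMode
{
public:
    virtual void NotifyPreSubmit() override { ++m_submits; }
    virtual bool IsTracingEnabled() const override { return m_submits > 0; }

private:
    unsigned m_submits = 0; // placeholder for TraceState et al.
};

int main()
{
    DevModeRgp rgp;
    IDevMode*  pDevMode = &rgp;
    pDevMode->NotifyPreSubmit();
    std::printf("tracing: %d\n", pDevMode->IsTracingEnabled());
    return 0;
}
```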
-inline bool DevModeMgr::IsQueueTimingActive( - const Device* pDevice - ) const -{ - return (m_trace.queueTimingEnabled && - (m_trace.status == TraceStatus::Running || - m_trace.status == TraceStatus::Preparing || - m_trace.status == TraceStatus::WaitingForSqtt) && - (pDevice->VkPhysicalDevice(DefaultDeviceIndex) == m_trace.pDevice->VkPhysicalDevice(DefaultDeviceIndex))); } -// ===================================================================================================================== -bool DevModeMgr::GetTraceFrameBeginTag( - uint64_t* pTag - ) const -{ - bool active; - - if (m_trace.status != TraceStatus::Idle) - { - *pTag = m_traceFrameBeginTag; - - active = true; - } - else - { - active = false; - } - - return active; -} - -// ===================================================================================================================== -bool DevModeMgr::GetTraceFrameEndTag( - uint64_t* pTag - ) const -{ - bool active; - - if (m_trace.status != TraceStatus::Idle) - { - *pTag = m_traceFrameEndTag; - - active = true; - } - else - { - active = false; - } - - return active; -} - -#endif -}; - #endif /* __DEVMODE_DEVMODE_MGR_H__ */ diff --git a/icd/api/devmode/devmode_mgr.cpp b/icd/api/devmode/devmode_rgp.cpp similarity index 95% rename from icd/api/devmode/devmode_mgr.cpp rename to icd/api/devmode/devmode_rgp.cpp index ee35ea3b..1006a647 100644 --- a/icd/api/devmode/devmode_mgr.cpp +++ b/icd/api/devmode/devmode_rgp.cpp @@ -24,14 +24,14 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file devmode_mgr.cpp - * @brief Contains implementation of the GPU Open Developer Mode manager + * @file devmode_rgp.cpp + * @brief Contains RGP implementation of the GPU Open Developer Mode manager *********************************************************************************************************************** */ #if ICD_GPUOPEN_DEVMODE_BUILD // Vulkan headers -#include "devmode/devmode_mgr.h" +#include "devmode/devmode_rgp.h" #include "include/vk_cmdbuffer.h" #include "include/vk_instance.h" #include "include/vk_pipeline.h" @@ -69,8 +69,6 @@ namespace vk { -constexpr uint64_t InfiniteTimeout = static_cast(1e10); - // ===================================================================================================================== // Translates a DevDriver result to a VkResult. 
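Annotation: note the ad-hoc InfiniteTimeout constant deleted above. With the PAL interface bump to 867 in XglVersions.cmake, the fence and semaphore waits in this diff pass std::chrono::nanoseconds (std::chrono::nanoseconds::max() or an explicit duration) instead of raw tick counts like ~0ULL and 1e10. A sketch of that type-safe convention against a simplified stand-in — the real Pal::IDevice::WaitForFences also takes fence pointers and a wait-all flag:

```cpp
#include <chrono>
#include <cstdio>

// Simplified stand-in for the PAL wait signature; only the timeout is modeled.
static bool WaitForFences(std::chrono::nanoseconds timeout)
{
    // A unit-typed timeout cannot be confused with a tick count in other units.
    return timeout == std::chrono::nanoseconds::max();
}

int main()
{
    using namespace std::chrono_literals;

    // Effectively-infinite wait, as used for blocking trace teardown here.
    std::printf("infinite: %d\n", WaitForFences(std::chrono::nanoseconds::max()));

    // Bounded wait, as in DebugPrintf::PostQueueProcess (1 ms).
    std::printf("bounded:  %d\n", WaitForFences(1ms));
    return 0;
}
```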
static VkResult DevDriverToVkResult( @@ -136,13 +134,13 @@ static DevDriver::Result GetPipelineHashes( void* pUserData, DevDriver::ExclusionFlags /*flags*/) { - DevModeMgr* pDevModeMgr = static_cast(pUserData); + DevModeRgp* pDevModeRgp = static_cast(pUserData); DevDriver::Result result = DevDriver::Result::NotReady; - Util::RWLockAuto cacheListLock(pDevModeMgr->GetPipelineReinjectionLock()); + Util::RWLockAuto cacheListLock(pDevModeRgp->GetPipelineReinjectionLock()); - auto pipelineCacheIter = pDevModeMgr->GetPipelineCacheListIterator(); + auto pipelineCacheIter = pDevModeRgp->GetPipelineCacheListIterator(); while (pipelineCacheIter.Get() != nullptr) { @@ -185,13 +183,13 @@ static DevDriver::Result GetPipelineCodeObjects( const DevDriver::PipelineHash* pPipelineHashes, size_t numHashes) { - DevModeMgr* pDevModeMgr = static_cast(pUserData); + DevModeRgp* pDevModeRgp = static_cast(pUserData); DevDriver::Result result = DevDriver::Result::NotReady; - Util::RWLockAuto cacheListLock(pDevModeMgr->GetPipelineReinjectionLock()); + Util::RWLockAuto cacheListLock(pDevModeRgp->GetPipelineReinjectionLock()); - auto pipelineCacheIter = pDevModeMgr->GetPipelineCacheListIterator(); + auto pipelineCacheIter = pDevModeRgp->GetPipelineCacheListIterator(); while (pipelineCacheIter.Get() != nullptr) { @@ -266,16 +264,16 @@ static DevDriver::Result InjectPipelineCodeObjects( void* pUserData, DevDriver::PipelineRecordsIterator& pipelineIter) { - DevModeMgr* pDevModeMgr = static_cast(pUserData); + DevModeRgp* pDevModeRgp = static_cast(pUserData); DevDriver::Result result = DevDriver::Result::NotReady; uint32_t replacedCount = 0u; DevDriver::PipelineRecord record; - Util::RWLockAuto cacheListLock(pDevModeMgr->GetPipelineReinjectionLock()); + Util::RWLockAuto cacheListLock(pDevModeRgp->GetPipelineReinjectionLock()); - auto pipelineCacheIter = pDevModeMgr->GetPipelineCacheListIterator(); + auto pipelineCacheIter = pDevModeRgp->GetPipelineCacheListIterator(); while (pipelineCacheIter.Get() != nullptr) { @@ -312,7 +310,8 @@ static DevDriver::Result InjectPipelineCodeObjects( } // ===================================================================================================================== -DevModeMgr::DevModeMgr(Instance* pInstance) +DevModeRgp::DevModeRgp( + Instance* pInstance) : m_pInstance(pInstance), m_pDevDriverServer(pInstance->PalPlatform()->GetDevDriverServer()), @@ -344,24 +343,24 @@ DevModeMgr::DevModeMgr(Instance* pInstance) } // ===================================================================================================================== -DevModeMgr::~DevModeMgr() +DevModeRgp::~DevModeRgp() { DestroyRGPTracing(&m_trace); } // ===================================================================================================================== // Creates the GPU Open Developer Mode manager class. 
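Annotation: DevModeRgp::Create below keeps the driver's two-phase construction idiom — allocate instance-scoped storage, placement-new the object, run a fallible Init(), and unwind on failure. The same shape with standard C++ stand-ins for the driver's AllocMem/VK_PLACEMENT_NEW machinery (a sketch, not the actual allocator path):

```cpp
#include <cstdio>
#include <cstdlib>
#include <new>

class Manager
{
public:
    bool Init() { return true; }   // fallible second phase (always succeeds here)
    void Destroy()                 // mirrors Destroy(): destruct, then free storage
    {
        this->~Manager();
        std::free(this);
    }
};

static Manager* CreateManager()
{
    void* pStorage = std::malloc(sizeof(Manager)); // AllocMem stand-in
    if (pStorage == nullptr)
    {
        return nullptr;
    }

    Manager* pMgr = new (pStorage) Manager(); // VK_PLACEMENT_NEW stand-in
    if (pMgr->Init() == false)
    {
        pMgr->Destroy(); // unwind on Init failure
        return nullptr;
    }
    return pMgr;
}

int main()
{
    Manager* pMgr = CreateManager();
    std::printf("created: %d\n", pMgr != nullptr);
    if (pMgr != nullptr) { pMgr->Destroy(); }
    return 0;
}
```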
-VkResult DevModeMgr::Create( +VkResult DevModeRgp::Create( Instance* pInstance, - DevModeMgr** ppObject) + DevModeRgp** ppObject) { Pal::Result result = Pal::Result::Success; - void* pStorage = pInstance->AllocMem(sizeof(DevModeMgr), VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + void* pStorage = pInstance->AllocMem(sizeof(DevModeRgp), VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); if (pStorage != nullptr) { - DevModeMgr* pMgr = VK_PLACEMENT_NEW(pStorage) DevModeMgr(pInstance); + DevModeRgp* pMgr = VK_PLACEMENT_NEW(pStorage) DevModeRgp(pInstance); result = pMgr->Init(); @@ -384,7 +383,7 @@ VkResult DevModeMgr::Create( // ===================================================================================================================== // Initializes the devmode manager based on the current client flags. -Pal::Result DevModeMgr::Init() +Pal::Result DevModeRgp::Init() { Pal::Result result = Pal::Result::Success; @@ -400,7 +399,7 @@ Pal::Result DevModeMgr::Init() // Called during initial device enumeration prior to calling Pal::IDevice::CommitSettingsAndInit(). // // This finalizes the developer driver manager. -void DevModeMgr::Finalize( +void DevModeRgp::Finalize( uint32_t deviceCount, VulkanSettingsLoader* settingsLoaders[]) { @@ -437,7 +436,7 @@ void DevModeMgr::Finalize( // ===================================================================================================================== // Destroy the developer mode manager -void DevModeMgr::Destroy() +void DevModeRgp::Destroy() { Util::Destructor(this); @@ -446,7 +445,7 @@ void DevModeMgr::Destroy() // ===================================================================================================================== // Waits for the driver to be resumed if it's currently paused. -void DevModeMgr::WaitForDriverResume() +void DevModeRgp::WaitForDriverResume() { auto* pDriverControlServer = m_pDevDriverServer->GetDriverControlServer(); @@ -458,7 +457,7 @@ void DevModeMgr::WaitForDriverResume() // Called to notify of a frame-end boundary and is used to coordinate RGP trace start/stop. // // "delimiterType" represents how the transition/notify was triggered. -void DevModeMgr::NotifyFrameEnd( +void DevModeRgp::NotifyFrameEnd( const Queue* pQueue, FrameDelimiterType delimiterType) { @@ -509,7 +508,7 @@ void DevModeMgr::NotifyFrameEnd( } // ===================================================================================================================== -void DevModeMgr::AdvanceActiveTraceStep( +void DevModeRgp::AdvanceActiveTraceStep( TraceState* pState, const Queue* pQueue, bool beginFrame, @@ -588,7 +587,7 @@ void DevModeMgr::AdvanceActiveTraceStep( // Checks if all trace results are ready and finalizes the results, transmitting data through gpuopen. // // Transitions from Ending to Idle step. -Pal::Result DevModeMgr::TraceEndingToIdleStep(TraceState* pState) +Pal::Result DevModeRgp::TraceEndingToIdleStep(TraceState* pState) { VK_ASSERT(pState->status == TraceStatus::Ending); @@ -596,7 +595,8 @@ Pal::Result DevModeMgr::TraceEndingToIdleStep(TraceState* pState) if (m_blockingTraceEnd) { - result = pState->pDevice->PalDevice(DefaultDeviceIndex)->WaitForFences(1, &pState->pEndFence, true, InfiniteTimeout); + result = pState->pDevice->PalDevice(DefaultDeviceIndex)->WaitForFences( + 1, &pState->pEndFence, true, std::chrono::nanoseconds::max()); if (result != Pal::Result::Success) { @@ -677,7 +677,7 @@ Pal::Result DevModeMgr::TraceEndingToIdleStep(TraceState* pState) // Notifies of a frame-begin boundary and is used to coordinate RGP trace start/stop. 
// // "delimiterType" represents how the transition/notify was triggered. -void DevModeMgr::NotifyFrameBegin( +void DevModeRgp::NotifyFrameBegin( const Queue* pQueue, FrameDelimiterType delimiterType) { @@ -720,7 +720,7 @@ void DevModeMgr::NotifyFrameBegin( // ===================================================================================================================== // Returns the queue state for this aparticular queue. -DevModeMgr::TraceQueueState* DevModeMgr::FindTraceQueueState( +DevModeRgp::TraceQueueState* DevModeRgp::FindTraceQueueState( TraceState* pState, const Queue* pQueue) { @@ -758,7 +758,7 @@ DevModeMgr::TraceQueueState* DevModeMgr::FindTraceQueueState( // ===================================================================================================================== // Called from tracing layer before any queue submits any work. -void DevModeMgr::NotifyPreSubmit() +void DevModeRgp::NotifyPreSubmit() { // Check for pending traces here. TraceIdleToPendingStep(&m_trace); @@ -769,7 +769,7 @@ void DevModeMgr::NotifyPreSubmit() // each command buffer submit by the tracing layer and should be very light-weight. // // This function moves the trace state from Idle to Pending. -void DevModeMgr::TraceIdleToPendingStep( +void DevModeRgp::TraceIdleToPendingStep( TraceState* pState) { // Double-checked lock to test if there is a trace pending. If so, extract its trace parameters. @@ -910,7 +910,7 @@ void DevModeMgr::TraceIdleToPendingStep( // "delimiterType" represents how the transition/notify was triggered. // // This function transitions from the Pending state to the Preparing state. -Pal::Result DevModeMgr::TracePendingToPreparingStep( +Pal::Result DevModeRgp::TracePendingToPreparingStep( TraceState* pState, const Queue* pQueue, FrameDelimiterType delimiterType) @@ -1208,7 +1208,7 @@ Pal::Result DevModeMgr::TracePendingToPreparingStep( // information command buffer which starts SQ thread tracing (SQTT). // // This function transitions from the Preparing state to the Running state. -Pal::Result DevModeMgr::TracePreparingToRunningStep( +Pal::Result DevModeRgp::TracePreparingToRunningStep( TraceState* pState, const Queue* pQueue) { @@ -1336,7 +1336,7 @@ Pal::Result DevModeMgr::TracePreparingToRunningStep( // This function submits the command buffer to stop SQTT tracing. Full tracing still continues. // // This function transitions from the Running state to the WaitingForSqtt state. -Pal::Result DevModeMgr::TraceRunningToWaitingForSqttStep( +Pal::Result DevModeRgp::TraceRunningToWaitingForSqttStep( TraceState* pState, const Queue* pQueue) { @@ -1446,7 +1446,7 @@ Pal::Result DevModeMgr::TraceRunningToWaitingForSqttStep( // This function ends a running RGP trace. // // This function transitions from the WaitingForSqtt state to WaitingForResults state. 
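Annotation: the Trace*Step renames in this region all advance a single state machine over the TraceStatus stages (Idle, Pending, Preparing, Running, WaitingForSqtt, Ending) re-declared later in devmode_rgp.h. A compact sketch of that progression; in the driver each transition is gated on fences, prep-frame counts, and queue submits rather than a bare Advance():

```cpp
#include <cstdio>

// Mirrors the TraceStatus stages from this diff.
enum class TraceStatus
{
    Idle, Pending, Preparing, Running, WaitingForSqtt, Ending
};

// Simplified: one notification advances one stage; real transitions have
// per-stage preconditions (trace parameters received, SQTT fence retired, ...).
static TraceStatus Advance(TraceStatus status)
{
    switch (status)
    {
    case TraceStatus::Idle:           return TraceStatus::Pending;
    case TraceStatus::Pending:        return TraceStatus::Preparing;
    case TraceStatus::Preparing:      return TraceStatus::Running;
    case TraceStatus::Running:        return TraceStatus::WaitingForSqtt;
    case TraceStatus::WaitingForSqtt: return TraceStatus::Ending;
    case TraceStatus::Ending:         return TraceStatus::Idle;
    }
    return TraceStatus::Idle;
}

int main()
{
    TraceStatus status = TraceStatus::Idle;
    for (int frame = 0; frame < 6; ++frame)
    {
        status = Advance(status);
        std::printf("stage %d\n", static_cast<int>(status));
    }
    return 0;
}
```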
-Pal::Result DevModeMgr::TraceWaitingForSqttToEndingStep( +Pal::Result DevModeRgp::TraceWaitingForSqttToEndingStep( TraceState* pState, const Queue* pQueue) { @@ -1457,7 +1457,8 @@ Pal::Result DevModeMgr::TraceWaitingForSqttToEndingStep( if (fenceResult == Pal::Result::NotReady && m_blockingTraceEnd) { - fenceResult = pState->pDevice->PalDevice(DefaultDeviceIndex)->WaitForFences(1, &pState->pEndSqttFence, true, InfiniteTimeout); + fenceResult = pState->pDevice->PalDevice(DefaultDeviceIndex)->WaitForFences( + 1, &pState->pEndSqttFence, true, std::chrono::nanoseconds::max()); } // Return without advancing if not ready yet or submit failed @@ -1556,7 +1557,7 @@ Pal::Result DevModeMgr::TraceWaitingForSqttToEndingStep( // ===================================================================================================================== // This function resets and possibly cancels a currently active (between begin/end) RGP trace. It frees any dependent // resources. -void DevModeMgr::FinishOrAbortTrace( +void DevModeRgp::FinishOrAbortTrace( TraceState* pState, bool aborted) { @@ -1600,7 +1601,7 @@ void DevModeMgr::FinishOrAbortTrace( // ===================================================================================================================== // This function will reinitialize RGP tracing resources that are reused between traces if the new trace device // has changed since the last trace. -Pal::Result DevModeMgr::CheckTraceDeviceChanged( +Pal::Result DevModeRgp::CheckTraceDeviceChanged( TraceState* pState, Device* pNewDevice) { @@ -1630,7 +1631,8 @@ Pal::Result DevModeMgr::CheckTraceDeviceChanged( // ===================================================================================================================== // Destroys device-persistent RGP resources for a particular queue family -void DevModeMgr::DestroyTraceQueueFamilyResources(TraceQueueFamilyState* pState) +void DevModeRgp::DestroyTraceQueueFamilyResources( + TraceQueueFamilyState* pState) { if (pState->pTraceBeginCmdBuf != nullptr) { @@ -1670,7 +1672,8 @@ void DevModeMgr::DestroyTraceQueueFamilyResources(TraceQueueFamilyState* pState) // ===================================================================================================================== // Destroys device-persistent RGP resources -void DevModeMgr::DestroyRGPTracing(TraceState* pState) +void DevModeRgp::DestroyRGPTracing( + TraceState* pState) { if (pState->status != TraceStatus::Idle) { @@ -1725,7 +1728,7 @@ void DevModeMgr::DestroyRGPTracing(TraceState* pState) // // If "auxQueue" is true, then the queue provided does not belong to the tracing logical device, but belongs to the // same physical device (and thus, the same PAL device) -Pal::Result DevModeMgr::InitTraceQueueResources( +Pal::Result DevModeRgp::InitTraceQueueResources( TraceState* pState, bool* pHasDebugVmid, const Queue* pQueue, @@ -1831,7 +1834,7 @@ Pal::Result DevModeMgr::InitTraceQueueResources( // ===================================================================================================================== // This function finds out all the queues in the device that we have to synchronize for RGP-traced frames and // initializes resources for them. 
-Pal::Result DevModeMgr::InitTraceQueueResourcesForDevice( +Pal::Result DevModeRgp::InitTraceQueueResourcesForDevice( TraceState* pState, bool* pHasDebugVmid) { @@ -1868,7 +1871,7 @@ Pal::Result DevModeMgr::InitTraceQueueResourcesForDevice( // ===================================================================================================================== // This function initializes the queue family -specific resources to support RGP tracing for a particular queue family -Pal::Result DevModeMgr::InitTraceQueueFamilyResources( +Pal::Result DevModeRgp::InitTraceQueueFamilyResources( TraceState* pTraceState, TraceQueueFamilyState* pFamilyState) { @@ -2037,7 +2040,7 @@ Pal::Result DevModeMgr::InitTraceQueueFamilyResources( // ===================================================================================================================== // Initializes device-persistent RGP resources -Pal::Result DevModeMgr::InitRGPTracing( +Pal::Result DevModeRgp::InitRGPTracing( TraceState* pState, Device* pDevice) { @@ -2250,7 +2253,7 @@ Pal::Result DevModeMgr::InitRGPTracing( // ===================================================================================================================== // Called when a new device is created. This will preallocate reusable RGP trace resources for that device. -void DevModeMgr::PostDeviceCreate(Device* pDevice) +void DevModeRgp::PostDeviceCreate(Device* pDevice) { Util::MutexAuto lock(&m_traceMutex); @@ -2273,7 +2276,8 @@ void DevModeMgr::PostDeviceCreate(Device* pDevice) // ===================================================================================================================== // Called prior to a device's being destroyed. This will free persistent RGP trace resources for that device. -void DevModeMgr::PreDeviceDestroy(Device* pDevice) +void DevModeRgp::PreDeviceDestroy( + Device* pDevice) { Util::MutexAuto lock(&m_traceMutex); @@ -2285,7 +2289,7 @@ void DevModeMgr::PreDeviceDestroy(Device* pDevice) } // ===================================================================================================================== -bool DevModeMgr::QueueSupportsTiming( +bool DevModeRgp::QueueSupportsTiming( uint32_t deviceIdx, const Queue* pQueue) { @@ -2311,7 +2315,7 @@ bool DevModeMgr::QueueSupportsTiming( } // ===================================================================================================================== -Pal::Result DevModeMgr::TimedSignalQueueSemaphore( +Pal::Result DevModeRgp::TimedSignalQueueSemaphore( uint32_t deviceIdx, Queue* pQueue, VkSemaphore semaphore, @@ -2341,7 +2345,7 @@ Pal::Result DevModeMgr::TimedSignalQueueSemaphore( } // ===================================================================================================================== -Pal::Result DevModeMgr::TimedWaitQueueSemaphore( +Pal::Result DevModeRgp::TimedWaitQueueSemaphore( uint32_t deviceIdx, Queue* pQueue, VkSemaphore semaphore, @@ -2371,7 +2375,7 @@ Pal::Result DevModeMgr::TimedWaitQueueSemaphore( } // ===================================================================================================================== -bool DevModeMgr::IsTracingEnabled() const +bool DevModeRgp::IsTracingEnabled() const { VK_ASSERT(m_finalized); @@ -2386,7 +2390,7 @@ bool DevModeMgr::IsTracingEnabled() const } // ===================================================================================================================== -Pal::Result DevModeMgr::TimedQueueSubmit( +Pal::Result DevModeRgp::TimedQueueSubmit( uint32_t deviceIdx, Queue* pQueue, uint32_t 
cmdBufferCount, @@ -2467,7 +2471,7 @@ Pal::Result DevModeMgr::TimedQueueSubmit( // ===================================================================================================================== // Registers this pipeline, storing the code object binary and recording a load event in the RGP trace. -void DevModeMgr::PipelineCreated( +void DevModeRgp::PipelineCreated( Device* pDevice, Pipeline* pPipeline) { @@ -2510,7 +2514,7 @@ void DevModeMgr::PipelineCreated( // ===================================================================================================================== // Unregisters this pipeline, recording an unload event in the RGP trace. -void DevModeMgr::PipelineDestroyed( +void DevModeRgp::PipelineDestroyed( Device* pDevice, Pipeline* pPipeline) { @@ -2555,7 +2559,7 @@ void DevModeMgr::PipelineDestroyed( // ===================================================================================================================== // Registers the shader libraries under this pipeline so the contents of each library can be written into the RGP // trace file. -void DevModeMgr::ShaderLibrariesCreated( +void DevModeRgp::ShaderLibrariesCreated( Device* pDevice, RayTracingPipeline* pPipeline) { @@ -2573,7 +2577,7 @@ void DevModeMgr::ShaderLibrariesCreated( // ===================================================================================================================== // Unregisters the shader libraries under this pipeline, recording an unload event in the RGP trace. -void DevModeMgr::ShaderLibrariesDestroyed( +void DevModeRgp::ShaderLibrariesDestroyed( Device* pDevice, RayTracingPipeline* pPipeline) { @@ -2591,7 +2595,7 @@ void DevModeMgr::ShaderLibrariesDestroyed( // ===================================================================================================================== // Retrieves the target API PSO hash from the RGP Server -uint64_t DevModeMgr::GetInstructionTraceTargetHash() +uint64_t DevModeRgp::GetInstructionTraceTargetHash() { uint64_t targetHash = InvalidTargetPipelineHash; @@ -2610,7 +2614,7 @@ uint64_t DevModeMgr::GetInstructionTraceTargetHash() // ===================================================================================================================== // Starts instruction trace -void DevModeMgr::StartInstructionTrace( +void DevModeRgp::StartInstructionTrace( CmdBuffer* pCmdBuffer) { if (IsTracingEnabled()) @@ -2624,7 +2628,7 @@ void DevModeMgr::StartInstructionTrace( // ===================================================================================================================== // Stops instruction trace -void DevModeMgr::StopInstructionTrace( +void DevModeRgp::StopInstructionTrace( CmdBuffer* pCmdBuffer) { if (IsTracingEnabled()) @@ -2639,7 +2643,7 @@ void DevModeMgr::StopInstructionTrace( // ===================================================================================================================== // Registers a pipeline binary cache object with the pipeline URI service and initializes the pipeline URI service // the first time a pipeline binary cache object is registered -Util::Result DevModeMgr::RegisterPipelineCache( +Util::Result DevModeRgp::RegisterPipelineCache( PipelineBinaryCache* pPipelineCache, uint32_t postSizeLimit) { @@ -2693,7 +2697,7 @@ Util::Result DevModeMgr::RegisterPipelineCache( // ===================================================================================================================== // Deregisters a pipeline binary cache with the pipeline URI service -void 
DevModeMgr::DeregisterPipelineCache( +void DevModeRgp::DeregisterPipelineCache( PipelineBinaryCache* pPipelineCache) { Util::RWLockAuto readWriteLock(&m_pipelineReinjectionLock); @@ -2718,6 +2722,60 @@ void DevModeMgr::DeregisterPipelineCache( } } +// ===================================================================================================================== +bool DevModeRgp::IsQueueTimingActive( + const Device* pDevice + ) const +{ + return (m_trace.queueTimingEnabled && + (m_trace.status == TraceStatus::Running || + m_trace.status == TraceStatus::Preparing || + m_trace.status == TraceStatus::WaitingForSqtt) && + (pDevice->VkPhysicalDevice(DefaultDeviceIndex) == m_trace.pDevice->VkPhysicalDevice(DefaultDeviceIndex))); +} + +// ===================================================================================================================== +bool DevModeRgp::GetTraceFrameBeginTag( + uint64_t* pTag + ) const +{ + bool active; + + if (m_trace.status != TraceStatus::Idle) + { + *pTag = m_traceFrameBeginTag; + + active = true; + } + else + { + active = false; + } + + return active; +} + +// ===================================================================================================================== +bool DevModeRgp::GetTraceFrameEndTag( + uint64_t* pTag + ) const +{ + bool active; + + if (m_trace.status != TraceStatus::Idle) + { + *pTag = m_traceFrameEndTag; + + active = true; + } + else + { + active = false; + } + + return active; +} + }; // namespace vk #endif diff --git a/icd/api/devmode/devmode_rgp.h b/icd/api/devmode/devmode_rgp.h new file mode 100644 index 00000000..d24eb82e --- /dev/null +++ b/icd/api/devmode/devmode_rgp.h @@ -0,0 +1,320 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** +*********************************************************************************************************************** +* @file devmode_rgp.h +* @brief Contains the RGP implementation of the GPU Open Developer Mode (DevModeRgp) +*********************************************************************************************************************** +*/ + +#ifndef __DEVMODE_DEVMODE_RGP_H__ +#define __DEVMODE_DEVMODE_RGP_H__ + +#pragma once + +#include "devmode/devmode_mgr.h" + +// PAL headers +#include "palVector.h" + +// gpuutil headers +#include "gpuUtil/palGpaSession.h" +#if ICD_GPUOPEN_DEVMODE_BUILD +// gpuopen headers +#include "gpuopen.h" +#endif + +// PAL forward declarations +namespace Pal +{ +class ICmdBuffer; +class IFence; +class IQueueSemaphore; +struct PalPublicSettings; +} + +// DevDriver forward declarations +namespace DevDriver +{ +class DevDriverServer; +class PipelineUriService; +namespace RGPProtocol +{ +class RGPServer; +} +} + +namespace vk +{ + +// ===================================================================================================================== +// This class provides functionality to interact with the GPU Open Developer Mode message passing service and the rest +// of the driver. +class DevModeRgp final : public IDevMode +{ +#if ICD_GPUOPEN_DEVMODE_BUILD +public: + // Number of frames to wait before collecting a hardware trace. + // Note: This will be replaced in the future by a remotely configurable value provided by the RGP server. + static constexpr uint32_t NumTracePreparationFrames = 4; + + ~DevModeRgp(); + + static VkResult Create(Instance* pInstance, DevModeRgp** ppObject); + + virtual void Finalize( + uint32_t deviceCount, + VulkanSettingsLoader* settingsLoaders[]) override; + + virtual void Destroy() override; + + virtual void NotifyFrameBegin(const Queue* pQueue, FrameDelimiterType delimiterType) override; + virtual void NotifyFrameEnd(const Queue* pQueue, FrameDelimiterType delimiterType) override; + virtual void WaitForDriverResume() override; + virtual void PipelineCreated(Device* pDevice, Pipeline* pPipeline) override; + virtual void PipelineDestroyed(Device* pDevice, Pipeline* pPipeline) override; +#if VKI_RAY_TRACING + virtual void ShaderLibrariesCreated(Device* pDevice, RayTracingPipeline* pPipeline) override; + virtual void ShaderLibrariesDestroyed(Device* pDevice, RayTracingPipeline* pPipeline) override; +#endif + virtual void PostDeviceCreate(Device* pDevice) override; + virtual void PreDeviceDestroy(Device* pDevice) override; + virtual void NotifyPreSubmit() override; + + virtual uint64_t GetInstructionTraceTargetHash() override; + virtual void StartInstructionTrace(CmdBuffer* pCmdBuffer) override; + virtual void StopInstructionTrace(CmdBuffer* pCmdBuffer) override; + + virtual bool IsTracingEnabled() const override; + virtual bool IsCrashAnalysisEnabled() const override { return m_crashAnalysisEnabled; } + + virtual Pal::Result TimedQueueSubmit( + uint32_t deviceIdx, + Queue* pQueue, + uint32_t cmdBufferCount, + const VkCommandBuffer* pCommandBuffers, + const Pal::SubmitInfo& submitInfo, + VirtualStackFrame* pVirtStackFrame) override; + + virtual Pal::Result TimedSignalQueueSemaphore( + uint32_t deviceIdx, + Queue* pQueue, + VkSemaphore semaphore, + uint64_t value, + Pal::IQueueSemaphore* pQueueSemaphore) override; + + virtual Pal::Result TimedWaitQueueSemaphore( + uint32_t deviceIdx, + Queue* 
pQueue, + VkSemaphore semaphore, + uint64_t value, + Pal::IQueueSemaphore* pQueueSemaphore) override; + + virtual bool IsQueueTimingActive(const Device* pDevice) const override; + virtual bool GetTraceFrameBeginTag(uint64_t* pTag) const override; + virtual bool GetTraceFrameEndTag(uint64_t* pTag) const override; + + virtual Util::Result RegisterPipelineCache( + PipelineBinaryCache* pPipelineCache, + uint32_t postSizeLimit) override; + + virtual void DeregisterPipelineCache( + PipelineBinaryCache* pPipelineCache) override; + + Util::ListIterator GetPipelineCacheListIterator() + { return m_pipelineCaches.Begin(); } + + Util::RWLock* GetPipelineReinjectionLock() + { return &m_pipelineReinjectionLock; } + +private: + static constexpr uint32_t MaxTraceQueueFamilies = Queue::MaxQueueFamilies; + static constexpr uint32_t MaxTraceQueues = MaxTraceQueueFamilies * Queue::MaxQueuesPerFamily; + + // Various trigger modes supported for RGP traces + enum class TriggerMode : uint32_t + { + Present = 0, // Traces triggered by presents + Index, // Traces triggered by frame indices + Tag // Traces triggered by command buffer tags + }; + + // Steps that an RGP trace goes through + enum class TraceStatus : uint32_t + { + // "Pre-trace" stages: + Idle = 0, // No active trace and none requested + Pending, // We've identified that a trace has been requested and we've received its parameters, + // but we have not yet seen the first frame. + Preparing, // A trace has been requested but is not active yet because we are + // currently sampling timing information over some number of lead frames. + Running, // SQTT and queue timing is currently active for all command buffer submits. + + // "Post-trace" stages: + WaitingForSqtt, // Command to turn off SQTT has been submitted and we're waiting for fence confirmation. + Ending // Tracing is no longer active, but all results are not yet ready. + }; + + // Queue family (type)-specific state to support RGP tracing (part of device state) + struct TraceQueueFamilyState + { + uint32_t queueFamilyIndex; + Pal::QueueType queueType; + Pal::EngineType engineType; + Pal::ICmdBuffer* pTraceBeginCmdBuf; + Pal::ICmdBuffer* pTraceBeginSqttCmdBuf; + Pal::ICmdBuffer* pTraceEndSqttCmdBuf; + Pal::ICmdBuffer* pTraceEndCmdBuf; + Pal::ICmdBuffer* pTraceFlushCmdBuf; + bool supportsTracing; + bool usedForBegin; + bool usedForEndSqtt; + bool usedForEnd; + }; + + // Queue-specific resources to support RGP tracing (part of device state) + struct TraceQueueState + { + const Queue* pQueue; + TraceQueueFamilyState* pFamily; + Pal::uint64 queueId; + Pal::uint64 queueContext; + bool timingSupported; + }; + + // All per-device state to support RGP tracing + struct TraceState + { + TraceStatus status; // Current trace status (idle, running, etc.) 
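+                                                // The Trace*Step() helpers declared below advance this status
+                                                // through Idle -> Pending -> Preparing -> Running ->
+                                                // WaitingForSqtt -> Ending and back to Idle;
+                                                // FinishOrAbortTrace() covers both normal completion and aborts.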
+        bool                      labelDelimsPresent;   // True if a label delimiter is received
+
+        Device*                   pDevice;              // The device currently doing the tracing
+        Pal::ICmdAllocator*       pCmdAllocator;        // Command allocator for creating trace-begin/end buffers
+        Pal::IFence*              pBeginFence;          // Fence that is signaled when a trace-begin cmdbuf retires
+        Pal::IFence*              pEndSqttFence;        // Fence that is signaled when an SQTT-end cmdbuf retires
+        Pal::IFence*              pEndFence;            // Fence that is signaled when a trace-end cmdbuf retires
+        TraceQueueState*          pTracePrepareQueue;   // The queue that triggered the full start of a trace
+        TraceQueueState*          pTraceBeginQueue;     // The queue that triggered starting SQTT
+        TraceQueueState*          pTraceEndSqttQueue;   // The queue that triggered ending SQTT
+        TraceQueueState*          pTraceEndQueue;       // The queue that triggered the full end of a trace
+
+        GpuUtil::GpaSession*      pGpaSession;          // GPA session helper object for building RGP data
+        uint32_t                  gpaSampleId;          // Sample ID associated with the current trace
+        bool                      queueTimingEnabled;   // Queue timing is enabled
+        bool                      flushAllQueues;       // Flushes all queues during the last preparation frame.
+
+        // Queue-specific state/information for tracing:
+        uint32_t                  queueCount;
+        TraceQueueState           queueState[MaxTraceQueues];
+        uint32_t                  auxQueueCount;
+        TraceQueueState           auxQueueStates[MaxTraceQueues];  // Used for queues belonging to other logical
+                                                                   // devices pointing to the same physical device
+        uint32_t                  queueFamilyCount;
+        TraceQueueFamilyState     queueFamilyState[MaxTraceQueueFamilies];
+
+        uint32_t                  activeCmdBufCount;    // Number of command buffers in below list
+        Pal::ICmdBuffer*          pActiveCmdBufs[4];    // List of command buffers that need to be reset at end of trace
+        uint32_t                  preparedFrameCount;   // Number of frames counted while preparing for a trace
+        uint32_t                  sqttFrameCount;       // Number of frames counted while SQTT tracing is active
+        uint64_t                  frameBeginTag;        // If a command buffer with this debug-tag is submitted, it is
+                                                        // treated as a virtual frame-start event.
+        uint64_t                  frameEndTag;          // Similar to the above, but treated as a virtual frame-end
+                                                        // event (checked post-submit).
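+
+        // A hedged usage sketch for the tag-based trigger (TriggerMode::Tag): at submit time, a check along the
+        // lines of
+        //     uint64_t tag = 0;
+        //     if (pDevMode->GetTraceFrameBeginTag(&tag) && (cmdBufDebugTag == tag)) { /* virtual frame begin */ }
+        // would treat a tagged command buffer as a frame delimiter; pDevMode and cmdBufDebugTag are hypothetical
+        // names here. The getters report these tags only while a trace is not Idle.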
+ }; + + DevModeRgp(Instance* pInstance); + + Pal::Result Init(); + + Pal::Result CheckTraceDeviceChanged(TraceState* pState, Device* pNewDevice); + + Pal::Result InitRGPTracing(TraceState* pState, Device* pDevice); + void DestroyRGPTracing(TraceState* pState); + + Pal::Result InitTraceQueueResources(TraceState* pState, bool* pHasDebugVmid, const Queue* pQueue, bool auxQueue); + Pal::Result InitTraceQueueResourcesForDevice(TraceState* pState, bool* pHasDebugVmid); + Pal::Result InitTraceQueueFamilyResources(TraceState* pTraceState, TraceQueueFamilyState* pFamilyState); + void DestroyTraceQueueFamilyResources(TraceQueueFamilyState* pState); + TraceQueueState* FindTraceQueueState(TraceState* pState, const Queue* pQueue); + bool QueueSupportsTiming(uint32_t deviceIdx, const Queue* pQueue); + + // RGP trace state functionality + void AdvanceActiveTraceStep( + TraceState* pState, + const Queue* pQueue, + bool beginFrame, + FrameDelimiterType delimiterType); + void TraceIdleToPendingStep(TraceState* pState); + Pal::Result TracePendingToPreparingStep( + TraceState* pState, + const Queue* pQueue, + FrameDelimiterType delimiterType); + Pal::Result TracePreparingToRunningStep(TraceState* pState, const Queue* pQueue); + Pal::Result TraceRunningToWaitingForSqttStep(TraceState* pState, const Queue* pQueue); + Pal::Result TraceWaitingForSqttToEndingStep(TraceState* pState, const Queue* pQueue); + Pal::Result TraceEndingToIdleStep(TraceState* pState); + void FinishOrAbortTrace(TraceState* pState, bool aborted); + + Instance* m_pInstance; + DevDriver::DevDriverServer* m_pDevDriverServer; + DevDriver::RGPProtocol::RGPServer* m_pRGPServer; + DevDriver::PipelineUriService* m_pPipelineUriService; + Util::Mutex m_traceMutex; + TraceState m_trace; + bool m_finalized; + TriggerMode m_triggerMode; // Current trigger mode for RGP frame trace + uint32_t m_numPrepFrames; + uint32_t m_traceGpuMemLimit; + bool m_enableInstTracing; // Enable instruction-level SQTT tokens + bool m_enableSampleUpdates; + bool m_allowComputePresents; + bool m_blockingTraceEnd; // Wait on trace-end fences immediately. + uint32_t m_globalFrameIndex; + uint64_t m_traceFrameBeginTag; + uint64_t m_traceFrameEndTag; + uint32_t m_traceFrameBeginIndex; + uint32_t m_traceFrameEndIndex; + uint64_t m_targetApiPsoHash; + uint32_t m_seMask; // Shader engine mask + bool m_perfCountersEnabled; // True if perf counters are enabled + uint64_t m_perfCounterMemLimit; // Memory limit for perf counters + uint32_t m_perfCounterFrequency; // Counter sample frequency + bool m_useStaticVmid; + bool m_staticVmidActive; + bool m_crashAnalysisEnabled; + + using PerfCounterList = Util::Vector; + + PerfCounterList m_perfCounterIds; // List of perf counter ids + + using PipelineCacheList = Util::List; + + PipelineCacheList m_pipelineCaches; + Util::RWLock m_pipelineReinjectionLock; +#endif +}; + +} + +#endif /* __DEVMODE_DEVMODE_RGP_H__ */ diff --git a/icd/api/devmode/devmode_ubertrace.cpp b/icd/api/devmode/devmode_ubertrace.cpp new file mode 100644 index 00000000..0c55ce31 --- /dev/null +++ b/icd/api/devmode/devmode_ubertrace.cpp @@ -0,0 +1,577 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file devmode_ubertrace.cpp + * @brief Contains UberTrace implementation of the GPU Open Developer Mode manager + *********************************************************************************************************************** + */ + +#if ICD_GPUOPEN_DEVMODE_BUILD +// Vulkan headers +#include "devmode/devmode_ubertrace.h" +#include "include/vk_cmdbuffer.h" +#include "include/vk_instance.h" +#include "include/vk_pipeline.h" +#include "include/vk_graphics_pipeline.h" +#include "include/vk_graphics_pipeline_library.h" +#include "include/vk_physical_device.h" +#include "include/vk_utils.h" +#include "include/vk_conv.h" +#include "include/pipeline_binary_cache.h" +#include "sqtt/sqtt_layer.h" +#include "sqtt/sqtt_mgr.h" + +// PAL headers +#include "pal.h" +#include "palCodeObjectTraceSource.h" +#include "palQueueTimingsTraceSource.h" + +// gpuopen headers +#include "devDriverServer.h" +#include "msgChannel.h" +#include "msgTransport.h" +#include "protocols/driverControlServer.h" +#include "protocols/ddPipelineUriService.h" +#include "protocols/ddEventServer.h" + +#if VKI_RAY_TRACING +#include "raytrace/vk_ray_tracing_pipeline.h" +#endif + +namespace vk +{ + +// ===================================================================================================================== +DevModeUberTrace::DevModeUberTrace( + Instance* pInstance) + : + m_pInstance(pInstance), + m_pDevDriverServer(pInstance->PalPlatform()->GetDevDriverServer()), + m_finalized(false), + m_crashAnalysisEnabled(false), + m_globalFrameIndex(1), // Must start from 1 according to RGP spec + m_pTraceSession(pInstance->PalPlatform()->GetTraceSession()), + m_pCodeObjectTraceSource(nullptr), + m_pQueueTimingsTraceSource(nullptr) +{ +} + +// ===================================================================================================================== +DevModeUberTrace::~DevModeUberTrace() +{ + DestroyUberTraceResources(); +} + +// ===================================================================================================================== +// Creates the UberTrace GPU Open Developer Mode manager class. 
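+// A minimal usage sketch (assuming a valid Instance* pInstance):
+//     DevModeUberTrace* pDevMode = nullptr;
+//     VkResult result = DevModeUberTrace::Create(pInstance, &pDevMode);
+// Storage comes from the instance allocator; Destroy() runs the destructor and returns the memory via FreeMem().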
+VkResult DevModeUberTrace::Create( + Instance* pInstance, + DevModeUberTrace** ppObject) +{ + Pal::Result result = Pal::Result::Success; + + void* pStorage = pInstance->AllocMem(sizeof(DevModeUberTrace), VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + + if (pStorage != nullptr) + { + DevModeUberTrace* pMgr = VK_PLACEMENT_NEW(pStorage) DevModeUberTrace(pInstance); + + if (result == Pal::Result::Success) + { + *ppObject = pMgr; + } + else + { + pMgr->Destroy(); + } + } + else + { + result = Pal::Result::ErrorOutOfMemory; + } + + return PalToVkResult(result); +} + +// ===================================================================================================================== +void DevModeUberTrace::Finalize( + uint32_t deviceCount, + VulkanSettingsLoader* settingsLoaders[]) +{ + m_pDevDriverServer->GetDriverControlServer()->StartLateDeviceInit(); + + // Finalize the devmode manager + m_pDevDriverServer->Finalize(); + + m_crashAnalysisEnabled = m_pInstance->PalPlatform()->IsCrashAnalysisModeEnabled(); + + m_finalized = true; +} + +// ===================================================================================================================== +void DevModeUberTrace::Destroy() +{ + Util::Destructor(this); + m_pInstance->FreeMem(this); +} + +// ===================================================================================================================== +void DevModeUberTrace::NotifyFrameBegin( + const Queue* pQueue, + FrameDelimiterType delimiterType) +{ + // Wait for the driver to be resumed in case it's been paused. + WaitForDriverResume(); + + m_pInstance->PalPlatform()->UpdateFrameTraceController(pQueue->PalQueue(DefaultDeviceIndex)); +} + +// ===================================================================================================================== +void DevModeUberTrace::NotifyFrameEnd( + const Queue* pQueue, + FrameDelimiterType delimiterType) +{ + if (IsQueueTimingActive(pQueue->VkDevice())) + { + // Call TimedQueuePresent() to insert commands that collect GPU timestamp. + Pal::IQueue* pPalQueue = pQueue->PalQueue(DefaultDeviceIndex); + + // Currently nothing in the PresentInfo struct is used for inserting a timed present marker. + GpuUtil::TimedQueuePresentInfo timedPresentInfo = {}; + Pal::Result result = m_pQueueTimingsTraceSource->TimedQueuePresent(pPalQueue, timedPresentInfo); + + VK_ASSERT(result == Pal::Result::Success); + } + + m_globalFrameIndex++; +} + +// ===================================================================================================================== +// Waits for the driver to be resumed if it's currently paused. 
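+// NotifyFrameBegin() invokes this at the start of every frame; DriverTick() services the DriverControlServer, so
+// a paused driver does not proceed past this point until tooling resumes it.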
+void DevModeUberTrace::WaitForDriverResume()
+{
+    auto* pDriverControlServer = m_pDevDriverServer->GetDriverControlServer();
+
+    VK_ASSERT(pDriverControlServer != nullptr);
+    pDriverControlServer->DriverTick();
+}
+
+// =====================================================================================================================
+void DevModeUberTrace::PipelineCreated(
+    Device*   pDevice,
+    Pipeline* pPipeline)
+{
+    if (m_pCodeObjectTraceSource != nullptr)
+    {
+        GpuUtil::RegisterPipelineInfo pipelineInfo = { 0 };
+        pipelineInfo.apiPsoHash = pPipeline->GetApiHash();
+        if (pPipeline->PalPipeline(DefaultDeviceIndex) != nullptr)
+        {
+            bool isGplPipeline = false;
+            GraphicsPipeline* pGraphicsPipeline = nullptr;
+            if (pPipeline->GetType() == VK_PIPELINE_BIND_POINT_GRAPHICS)
+            {
+                pGraphicsPipeline = reinterpret_cast<GraphicsPipeline*>(pPipeline);
+                isGplPipeline = pGraphicsPipeline->GetPalShaderLibrary(GraphicsLibraryPreRaster) != nullptr;
+            }
+
+            if (isGplPipeline)
+            {
+                GpuUtil::RegisterLibraryInfo libInfo = { pipelineInfo.apiPsoHash };
+                for (uint32_t i = 0; i < GraphicsLibraryCount; i++)
+                {
+                    const Pal::IShaderLibrary* pLib =
+                        pGraphicsPipeline->GetPalShaderLibrary(static_cast<GraphicsLibraryType>(i));
+                    if (pLib != nullptr)
+                    {
+                        m_pCodeObjectTraceSource->RegisterLibrary(pLib, libInfo);
+                    }
+                }
+            }
+            else
+            {
+                m_pCodeObjectTraceSource->RegisterPipeline(pPipeline->PalPipeline(DefaultDeviceIndex), pipelineInfo);
+            }
+        }
+    }
+}
+
+// =====================================================================================================================
+void DevModeUberTrace::PipelineDestroyed(
+    Device*   pDevice,
+    Pipeline* pPipeline)
+{
+    if (m_pCodeObjectTraceSource != nullptr)
+    {
+        if (pPipeline->PalPipeline(DefaultDeviceIndex) != nullptr)
+        {
+            bool isGplPipeline = false;
+            if (pPipeline->GetType() == VK_PIPELINE_BIND_POINT_GRAPHICS)
+            {
+                GraphicsPipeline* pGraphicsPipeline = reinterpret_cast<GraphicsPipeline*>(pPipeline);
+                isGplPipeline = pGraphicsPipeline->GetPalShaderLibrary(GraphicsLibraryPreRaster) != nullptr;
+            }
+
+            if (isGplPipeline == false)
+            {
+                m_pCodeObjectTraceSource->UnregisterPipeline(pPipeline->PalPipeline(DefaultDeviceIndex));
+            }
+        }
+        else
+        {
+            if (pPipeline->GetType() == VK_PIPELINE_BIND_POINT_GRAPHICS)
+            {
+                GraphicsPipelineLibrary* pGraphicsLibrary = reinterpret_cast<GraphicsPipelineLibrary*>(pPipeline);
+                const Pal::IShaderLibrary* pPalLibraries[GraphicsLibraryCount] = {};
+                pGraphicsLibrary->GetOwnedPalShaderLibraries(pPalLibraries);
+                for (uint32_t i = 0; i < GraphicsLibraryCount; i++)
+                {
+                    if (pPalLibraries[i] != nullptr)
+                    {
+                        m_pCodeObjectTraceSource->UnregisterLibrary(pPalLibraries[i]);
+                    }
+                }
+            }
+        }
+    }
+}
+
+#if VKI_RAY_TRACING
+// =====================================================================================================================
+void DevModeUberTrace::ShaderLibrariesCreated(
+    Device*             pDevice,
+    RayTracingPipeline* pPipeline)
+{
+    if (m_pCodeObjectTraceSource != nullptr)
+    {
+        for (uint32_t i = 0; i < pPipeline->GetShaderLibraryCount(); ++i)
+        {
+            GpuUtil::RegisterLibraryInfo pipelineInfo = { pPipeline->GetApiHash() };
+            m_pCodeObjectTraceSource->RegisterLibrary(pPipeline->PalShaderLibrary(i), pipelineInfo);
+        }
+    }
+}
+
+// =====================================================================================================================
+void DevModeUberTrace::ShaderLibrariesDestroyed(
+    Device*             pDevice,
+    RayTracingPipeline* pPipeline)
+{
+    if (m_pCodeObjectTraceSource != nullptr)
+    {
+        for (uint32_t i = 0; i < pPipeline->GetShaderLibraryCount(); ++i)
+        {
+
m_pCodeObjectTraceSource->UnregisterLibrary(pPipeline->PalShaderLibrary(i)); + } + } +} +#endif + +// ===================================================================================================================== +Pal::Result DevModeUberTrace::RegisterQueuesForDevice( + Device* pDevice) +{ + Pal::Result result = Pal::Result::Success; + + for (uint32_t familyIdx = 0; familyIdx < Queue::MaxQueueFamilies; ++familyIdx) + { + for (uint32_t queueIdx = 0; + (queueIdx < Queue::MaxQueuesPerFamily) && (result == Pal::Result::Success); + ++queueIdx) + { + VkQueue queueHandle = VK_NULL_HANDLE; + pDevice->GetQueue(familyIdx, queueIdx, &queueHandle); + + if (queueHandle != VK_NULL_HANDLE) + { + Queue* pQueue = ApiQueue::ObjectFromHandle(queueHandle); + Pal::IQueue* pPalQueue = pQueue->PalQueue(DefaultDeviceIndex); + + // Get the OS context handle for this queue (this is a thing that RGP needs on DX clients; + // it may be optional for Vulkan, but we provide it anyway if available). + Pal::KernelContextInfo kernelCxtInfo = {}; + Pal::Result resultQueryKernel = pPalQueue->QueryKernelContextInfo(&kernelCxtInfo); + + uint64_t queueId = reinterpret_cast(ApiQueue::FromObject(pQueue)); + uint64_t queueContext = (resultQueryKernel == Pal::Result::Success) + ? kernelCxtInfo.contextIdentifier + : 0; + + result = m_pQueueTimingsTraceSource->RegisterTimedQueue(pPalQueue, queueId, queueContext); + } + } + } + + return result; +} + +// ===================================================================================================================== +void DevModeUberTrace::PostDeviceCreate( + Device* pDevice) +{ + Pal::Result result = InitUberTraceResources(pDevice->PalDevice(DefaultDeviceIndex)); + + if (result == Pal::Result::Success) + { + result = RegisterQueuesForDevice(pDevice); + } + + VK_ASSERT(result == Pal::Result::Success); + + auto* pDriverControlServer = m_pDevDriverServer->GetDriverControlServer(); + + VK_ASSERT(pDriverControlServer != nullptr); + + // If the driver hasn't been marked as fully initialized yet, mark it now. We consider the time after the logical + // device creation to be the fully initialized driver position. This is mainly because PAL is fully initialized + // at this point and we also know whether or not the debug vmid has been acquired. External tools use this + // information to decide when it's reasonable to make certain requests of the driver through protocol functions. + if (pDriverControlServer->IsDriverInitialized() == false) + { + pDriverControlServer->FinishDeviceInit(); + } +} + +// ===================================================================================================================== +bool DevModeUberTrace::IsTracingEnabled() const +{ + return m_pTraceSession->IsTracingEnabled(); +} + +// ===================================================================================================================== +Pal::Result DevModeUberTrace::TimedQueueSubmit( + uint32_t deviceIdx, + Queue* pQueue, + uint32_t cmdBufferCount, + const VkCommandBuffer* pCommandBuffers, + const Pal::SubmitInfo& submitInfo, + VirtualStackFrame* pVirtStackFrame) +{ + VK_ASSERT(cmdBufferCount == submitInfo.pPerSubQueueInfo[0].cmdBufferCount); + + bool timingSupported = IsQueueTimingActive(pQueue->VkDevice()) && (submitInfo.pPerSubQueueInfo[0].cmdBufferCount > 0); + + // Fill in extra meta-data information to associate the API command buffer data with the generated + // timing information. 
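+    // If either per-command-buffer ID array below fails to allocate, timingSupported is cleared and the submission
+    // falls back to the plain (non-timed) Queue::PalQueueSubmit() path further down.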
+    GpuUtil::TimedSubmitInfo timedSubmitInfo = {};
+    Pal::uint64* pApiCmdBufIds  = nullptr;
+    Pal::uint32* pSqttCmdBufIds = nullptr;
+
+    if (timingSupported)
+    {
+        pApiCmdBufIds  = pVirtStackFrame->AllocArray<Pal::uint64>(cmdBufferCount);
+        pSqttCmdBufIds = pVirtStackFrame->AllocArray<Pal::uint32>(cmdBufferCount);
+
+        timedSubmitInfo.pApiCmdBufIds  = pApiCmdBufIds;
+        timedSubmitInfo.pSqttCmdBufIds = pSqttCmdBufIds;
+        timedSubmitInfo.frameIndex     = m_globalFrameIndex;
+
+        timingSupported &= (pApiCmdBufIds != nullptr) && (pSqttCmdBufIds != nullptr);
+    }
+
+    Pal::Result result = Pal::Result::NotReady;
+
+    Pal::IQueue* pPalQueue = pQueue->PalQueue(deviceIdx);
+
+    if (timingSupported)
+    {
+        for (uint32_t cbIdx = 0; cbIdx < cmdBufferCount; ++cbIdx)
+        {
+            uintptr_t intHandle = reinterpret_cast<uintptr_t>(pCommandBuffers[cbIdx]);
+
+            pApiCmdBufIds[cbIdx] = intHandle;
+
+            CmdBuffer* pCmdBuf = ApiCmdBuffer::ObjectFromHandle(pCommandBuffers[cbIdx]);
+
+            pSqttCmdBufIds[cbIdx] = 0;
+
+            if (pCmdBuf->GetSqttState() != nullptr)
+            {
+                pSqttCmdBufIds[cbIdx] = pCmdBuf->GetSqttState()->GetId().u32All;
+            }
+
+            VK_ASSERT(pCmdBuf->PalCmdBuffer(DefaultDeviceIndex) == submitInfo.pPerSubQueueInfo[0].ppCmdBuffers[cbIdx]);
+        }
+
+        // Do a timed submit of all the command buffers
+        result = m_pQueueTimingsTraceSource->TimedSubmit(pPalQueue, submitInfo, timedSubmitInfo);
+
+        VK_ASSERT(result == Pal::Result::Success);
+    }
+
+    // Punt to non-timed submit if a timed submit fails (or is not supported)
+    if (result != Pal::Result::Success)
+    {
+        result = Queue::PalQueueSubmit(pQueue->VkDevice(), pPalQueue, submitInfo);
+    }
+
+    if (pApiCmdBufIds != nullptr)
+    {
+        pVirtStackFrame->FreeArray(pApiCmdBufIds);
+    }
+
+    if (pSqttCmdBufIds != nullptr)
+    {
+        pVirtStackFrame->FreeArray(pSqttCmdBufIds);
+    }
+
+    return result;
+}
+
+// =====================================================================================================================
+Pal::Result DevModeUberTrace::TimedSignalQueueSemaphore(
+    uint32_t              deviceIdx,
+    Queue*                pQueue,
+    VkSemaphore           semaphore,
+    uint64_t              value,
+    Pal::IQueueSemaphore* pQueueSemaphore)
+{
+    Pal::IQueue* pPalQueue = pQueue->PalQueue(deviceIdx);
+
+    Pal::Result result = Pal::Result::NotReady;
+
+    if (IsQueueTimingActive(pQueue->VkDevice()))
+    {
+        GpuUtil::TimedQueueSemaphoreInfo timedSemaphoreInfo = {};
+
+        timedSemaphoreInfo.semaphoreID = (uint64_t)semaphore;
+        result = m_pQueueTimingsTraceSource->TimedSignalQueueSemaphore(pPalQueue, pQueueSemaphore, timedSemaphoreInfo, value);
+
+        VK_ASSERT(result == Pal::Result::Success);
+    }
+
+    if (result != Pal::Result::Success)
+    {
+        result = pPalQueue->SignalQueueSemaphore(pQueueSemaphore, value);
+    }
+
+    return result;
+}
+
+// =====================================================================================================================
+Pal::Result DevModeUberTrace::TimedWaitQueueSemaphore(
+    uint32_t              deviceIdx,
+    Queue*                pQueue,
+    VkSemaphore           semaphore,
+    uint64_t              value,
+    Pal::IQueueSemaphore* pQueueSemaphore)
+{
+    Pal::IQueue* pPalQueue = pQueue->PalQueue(deviceIdx);
+
+    Pal::Result result = Pal::Result::NotReady;
+
+    if (IsQueueTimingActive(pQueue->VkDevice()))
+    {
+        GpuUtil::TimedQueueSemaphoreInfo timedSemaphoreInfo = {};
+
+        timedSemaphoreInfo.semaphoreID = (uint64_t)semaphore;
+        result = m_pQueueTimingsTraceSource->TimedWaitQueueSemaphore(pPalQueue, pQueueSemaphore, timedSemaphoreInfo, value);
+
+        VK_ASSERT(result == Pal::Result::Success);
+    }
+
+    if (result != Pal::Result::Success)
+    {
+        result = pPalQueue->WaitQueueSemaphore(pQueueSemaphore, value);
+    }
+
+    return result;
+}
+
+//
===================================================================================================================== +bool DevModeUberTrace::IsQueueTimingActive( + const Device* /*pDevice*/ + ) const +{ + return (m_pQueueTimingsTraceSource != nullptr) ? m_pQueueTimingsTraceSource->IsTimingInProgress() : false; +} + +// ===================================================================================================================== +Pal::Result DevModeUberTrace::InitUberTraceResources( + Pal::IDevice* pPalDevice) +{ + Pal::Result result = Pal::Result::ErrorOutOfMemory; + + void* pStorage = m_pInstance->AllocMem(sizeof(GpuUtil::CodeObjectTraceSource), VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + + if (pStorage != nullptr) + { + m_pCodeObjectTraceSource = VK_PLACEMENT_NEW(pStorage) + GpuUtil::CodeObjectTraceSource(m_pInstance->PalPlatform()); + + result = m_pTraceSession->RegisterSource(m_pCodeObjectTraceSource); + } + + if (result == Pal::Result::Success) + { + pStorage = m_pInstance->AllocMem(sizeof(GpuUtil::QueueTimingsTraceSource), VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + + if (pStorage != nullptr) + { + m_pQueueTimingsTraceSource = VK_PLACEMENT_NEW(pStorage) + GpuUtil::QueueTimingsTraceSource(m_pInstance->PalPlatform()); + + result = m_pTraceSession->RegisterSource(m_pQueueTimingsTraceSource); + } + else + { + result = Pal::Result::ErrorOutOfMemory; + } + } + + if (result == Pal::Result::Success) + { + result = m_pQueueTimingsTraceSource->Init(pPalDevice); + } + + if (result != Pal::Result::Success) + { + DestroyUberTraceResources(); + } + return result; +} + +// ===================================================================================================================== +void DevModeUberTrace::DestroyUberTraceResources() +{ + if (m_pCodeObjectTraceSource != nullptr) + { + m_pTraceSession->UnregisterSource(m_pCodeObjectTraceSource); + m_pInstance->FreeMem(m_pCodeObjectTraceSource); + m_pCodeObjectTraceSource = nullptr; + } + + if (m_pQueueTimingsTraceSource != nullptr) + { + m_pTraceSession->UnregisterSource(m_pQueueTimingsTraceSource); + m_pInstance->FreeMem(m_pQueueTimingsTraceSource); + m_pQueueTimingsTraceSource = nullptr; + } +} + +} // namespace vk + +#endif diff --git a/icd/api/devmode/devmode_ubertrace.h b/icd/api/devmode/devmode_ubertrace.h new file mode 100644 index 00000000..4710ca17 --- /dev/null +++ b/icd/api/devmode/devmode_ubertrace.h @@ -0,0 +1,148 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** +*********************************************************************************************************************** +* @file devmode_ubertrace.h +* @brief Contains the UberTrace implementation of the GPU Open Developer Mode (DevModeUberTrace) +*********************************************************************************************************************** +*/ + +#ifndef __DEVMODE_DEVMODE_UBERTRACE_H__ +#define __DEVMODE_DEVMODE_UBERTRACE_H__ + +#pragma once + +#include "devmode/devmode_mgr.h" +#include "palTraceSession.h" + +// GPUOpen forward declarations +namespace DevDriver +{ +class DevDriverServer; +} + +namespace GpuUtil +{ +class CodeObjectTraceSource; +class QueueTimingsTraceSource; +} + +namespace vk +{ + +// ===================================================================================================================== +// This class provides functionality to interact with the GPU Open Developer Mode message passing service and the rest +// of the driver. +class DevModeUberTrace final : public IDevMode +{ +#if ICD_GPUOPEN_DEVMODE_BUILD +public: + ~DevModeUberTrace(); + + static VkResult Create(Instance* pInstance, DevModeUberTrace** ppObject); + + virtual void Finalize( + uint32_t deviceCount, + VulkanSettingsLoader* settingsLoaders[]) override; + + virtual void Destroy() override; + + virtual void NotifyFrameBegin(const Queue* pQueue, FrameDelimiterType delimiterType) override; + virtual void NotifyFrameEnd(const Queue* pQueue, FrameDelimiterType delimiterType) override; + virtual void WaitForDriverResume() override; + virtual void PipelineCreated(Device* pDevice, Pipeline* pPipeline) override; + virtual void PipelineDestroyed(Device* pDevice, Pipeline* pPipeline) override; +#if VKI_RAY_TRACING + virtual void ShaderLibrariesCreated(Device* pDevice, RayTracingPipeline* pPipeline) override; + virtual void ShaderLibrariesDestroyed(Device* pDevice, RayTracingPipeline* pPipeline) override; +#endif + virtual void PostDeviceCreate(Device* pDevice) override; + virtual void PreDeviceDestroy(Device* pDevice) override { }; + virtual void NotifyPreSubmit() override { }; + + virtual bool IsTracingEnabled() const override; + virtual bool IsCrashAnalysisEnabled() const override { return m_crashAnalysisEnabled; } + virtual bool IsQueueTimingActive(const Device* pDevice) const override; + + virtual Pal::Result TimedQueueSubmit( + uint32_t deviceIdx, + Queue* pQueue, + uint32_t cmdBufferCount, + const VkCommandBuffer* pCommandBuffers, + const Pal::SubmitInfo& submitInfo, + VirtualStackFrame* pVirtStackFrame) override; + + virtual Pal::Result TimedSignalQueueSemaphore( + uint32_t deviceIdx, + Queue* pQueue, + VkSemaphore semaphore, + uint64_t value, + Pal::IQueueSemaphore* pQueueSemaphore) override; + + virtual Pal::Result TimedWaitQueueSemaphore( + uint32_t deviceIdx, + Queue* pQueue, + VkSemaphore semaphore, + uint64_t value, + Pal::IQueueSemaphore* pQueueSemaphore) override; + + // Deprecated functionality + virtual uint64_t GetInstructionTraceTargetHash() override { return InvalidTargetPipelineHash; }; + virtual void StartInstructionTrace(CmdBuffer* pCmdBuffer) override { }; + 
virtual void StopInstructionTrace(CmdBuffer* pCmdBuffer) override { }; + + virtual bool GetTraceFrameBeginTag(uint64_t* pTag) const override { return false; }; + virtual bool GetTraceFrameEndTag(uint64_t* pTag) const override { return false; }; + + virtual Util::Result RegisterPipelineCache( + PipelineBinaryCache* pPipelineCache, + uint32_t postSizeLimit) override { return Util::Result::Success; }; + + virtual void DeregisterPipelineCache( + PipelineBinaryCache* pPipelineCache) override { }; + +private: + DevModeUberTrace(Instance* pInstance); + + Pal::Result InitUberTraceResources(Pal::IDevice* pPalDevice); + void DestroyUberTraceResources(); + + Pal::Result RegisterQueuesForDevice(Device* pDevice); + + Instance* m_pInstance; + DevDriver::DevDriverServer* m_pDevDriverServer; + bool m_finalized; + bool m_crashAnalysisEnabled; + uint32_t m_globalFrameIndex; + + GpuUtil::TraceSession* m_pTraceSession; + GpuUtil::CodeObjectTraceSource* m_pCodeObjectTraceSource; + GpuUtil::QueueTimingsTraceSource* m_pQueueTimingsTraceSource; +#endif +}; + +} + +#endif /* __DEVMODE_DEVMODE_UBERTRACE_H__ */ diff --git a/icd/api/entry.cpp b/icd/api/entry.cpp index c45d695b..23728f08 100644 --- a/icd/api/entry.cpp +++ b/icd/api/entry.cpp @@ -374,6 +374,41 @@ VKAPI_ATTR void VKAPI_CALL vkCmdDispatchIndirect( ApiCmdBuffer::ObjectFromHandle(cmdBuffer)->DispatchIndirect(buffer, offset); } +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdPreprocessGeneratedCommandsNV( + VkCommandBuffer commandBuffer, + const VkGeneratedCommandsInfoNV* pGeneratedCommandsInfo) +{ +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdExecuteGeneratedCommandsNV( + VkCommandBuffer commandBuffer, + VkBool32 isPreprocessed, + const VkGeneratedCommandsInfoNV* pGeneratedCommandsInfo) +{ + ApiCmdBuffer::ObjectFromHandle(commandBuffer)->ExecuteIndirect(isPreprocessed, pGeneratedCommandsInfo); +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdBindPipelineShaderGroupNV( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipeline pipeline, + uint32_t groupIndex) +{ + VK_NOT_IMPLEMENTED; +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdUpdatePipelineIndirectBufferNV( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipeline pipeline) +{ + VK_NOT_IMPLEMENTED; +} + // ===================================================================================================================== VKAPI_ATTR void VKAPI_CALL vkCmdCopyBuffer( VkCommandBuffer cmdBuffer, diff --git a/icd/api/graphics_pipeline_common.cpp b/icd/api/graphics_pipeline_common.cpp index e5f5e0a2..7a036e53 100644 --- a/icd/api/graphics_pipeline_common.cpp +++ b/icd/api/graphics_pipeline_common.cpp @@ -199,11 +199,15 @@ static void BuildPalColorBlendStateCreateInfo( { uint32_t location = i; - if ((extStructs.pRenderingAttachmentLocationInfo != nullptr) && - (extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations != nullptr) && - (extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations[i] != VK_ATTACHMENT_UNUSED)) + if ((extStructs.pRenderingAttachmentLocationInfo != nullptr) && + 
(extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations != nullptr)) { location = extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations[i]; + + if (location == VK_ATTACHMENT_UNUSED) + { + continue; + } } const VkPipelineColorBlendAttachmentState& attachmentState = pColorBlendState->pAttachments[i]; @@ -820,7 +824,7 @@ VkResult GraphicsPipelineCommon::Create( } else if (pDevice->GetRuntimeSettings().pipelineLinkOptimizationMode == PipelineLinkOptimizationAlwaysOptimized) { - flags |= ~VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT; + flags |= VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT; } if ((flags & VK_PIPELINE_CREATE_LIBRARY_BIT_KHR) != 0) @@ -1642,11 +1646,15 @@ static void BuildColorBlendState( { uint32_t location = i; - if ((extStructs.pRenderingAttachmentLocationInfo != nullptr) && - (extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations != nullptr) && - (extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations[i] != VK_ATTACHMENT_UNUSED)) + if ((extStructs.pRenderingAttachmentLocationInfo != nullptr) && + (extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations != nullptr)) { location = extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations[i]; + + if (location == VK_ATTACHMENT_UNUSED) + { + continue; + } } auto pCbDst = &pInfo->pipeline.cbState.target[location]; diff --git a/icd/api/include/app_profile.h b/icd/api/include/app_profile.h index bf7757fe..0c2f8811 100644 --- a/icd/api/include/app_profile.h +++ b/icd/api/include/app_profile.h @@ -112,8 +112,6 @@ enum class AppProfile : uint32_t SniperElite5, // Sniper Elite 5 by Rebellion SeriousSamVrTheLastHope, // Serious Sam VR The Last Hope by Croteam BaldursGate3, // Baldur's Gate by Larian Studios - Enshrouded, // Enshrouded by Keen Games - HolisticEngine, // Holistic Engine by Keen Games #if VKI_RAY_TRACING ControlDX12, // VKD3D Control Ultimate Edition RayTracingWeekends, // RayTracingInVulkan demo diff --git a/icd/api/include/app_shader_optimizer.h b/icd/api/include/app_shader_optimizer.h index f2b848ca..f3a1fc7b 100644 --- a/icd/api/include/app_shader_optimizer.h +++ b/icd/api/include/app_shader_optimizer.h @@ -177,7 +177,7 @@ class ShaderOptimizer const PipelineOptimizerKey& pipelineKey, Pal::DynamicComputeShaderInfo* pDynamicComputeShaderInfo) const; - void ApplyProfileToDynamicGraphicsShaderInfo( + bool ApplyProfileToDynamicGraphicsShaderInfo( const ShaderProfileAction& action, Pal::DynamicGraphicsShaderInfo* pGraphicsShaderInfo) const; diff --git a/icd/api/include/compiler_solution.h b/icd/api/include/compiler_solution.h index ca4c3a64..2ad93eb0 100644 --- a/icd/api/include/compiler_solution.h +++ b/icd/api/include/compiler_solution.h @@ -86,6 +86,7 @@ struct LlpcShaderLibraryBlobHeader { uint32_t binaryLength; // Partial ELF binary length uint32_t fragMetaLength; // Fragment shader metadata length + bool requireFullPipeline; // Whether require full pipeline }; // ===================================================================================================================== // Pipeline Creation feedback info. @@ -175,6 +176,16 @@ static GraphicsLibraryType GetGraphicsLibraryType( return stage == ShaderStage::ShaderStageFragment ? 
GraphicsLibraryFragment : GraphicsLibraryPreRaster; } +// ===================================================================================================================== +static VkGraphicsPipelineLibraryFlagBitsEXT GetVkGraphicsLibraryFlagBit( + const ShaderStage stage) +{ + VK_ASSERT(stage < ShaderStage::ShaderStageGfxCount); + return stage == ShaderStage::ShaderStageFragment ? + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT : + VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT; +} + // ===================================================================================================================== struct GraphicsPipelineBinaryCreateInfo { diff --git a/icd/api/include/defer_compile_thread.h b/icd/api/include/defer_compile_thread.h index 6c53624b..c26c742e 100644 --- a/icd/api/include/defer_compile_thread.h +++ b/icd/api/include/defer_compile_thread.h @@ -114,7 +114,7 @@ class DeferCompileThread final : public Util::Thread while (m_stop == false) { // Waits for new signal. - m_event.Wait(1.0f); + m_event.Wait(Util::fseconds{ 1.0f }); m_event.Reset(); DeferredCompileWorkload task; diff --git a/icd/api/include/graphics_pipeline_common.h b/icd/api/include/graphics_pipeline_common.h index 494fd761..0de05e0c 100644 --- a/icd/api/include/graphics_pipeline_common.h +++ b/icd/api/include/graphics_pipeline_common.h @@ -334,9 +334,9 @@ class GraphicsPipelineCommon : public Pipeline // Constructor of GraphicsPipelineCommon GraphicsPipelineCommon( #if VKI_RAY_TRACING - bool hasRayTracing, + bool hasRayTracing, #endif - Device* const pDevice) + Device* const pDevice) : Pipeline( pDevice, #if VKI_RAY_TRACING diff --git a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h index e2656eeb..6d09e280 100644 --- a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h +++ b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h @@ -69,7 +69,7 @@ extern "C" { #define VK_API_VERSION_1_0 VK_MAKE_API_VERSION(0, 1, 0, 0)// Patch version should always be set to 0 // Version of this file -#define VK_HEADER_VERSION 279 +#define VK_HEADER_VERSION 280 // Complete version of this file #define VK_HEADER_VERSION_COMPLETE VK_MAKE_API_VERSION(0, 1, 3, VK_HEADER_VERSION) @@ -1111,6 +1111,7 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_POOL_OVERALLOCATION_FEATURES_NV = 1000546000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAW_ACCESS_CHAINS_FEATURES_NV = 1000555000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT16_VECTOR_FEATURES_NV = 1000563000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_VALIDATION_FEATURES_NV = 1000568000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES, VK_STRUCTURE_TYPE_DEBUG_REPORT_CREATE_INFO_EXT = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, @@ -19134,6 +19135,18 @@ typedef struct VkPhysicalDeviceShaderAtomicFloat16VectorFeaturesNV { +// VK_NV_ray_tracing_validation is a preprocessor guard. Do not pass it to API calls. 
+#define VK_NV_ray_tracing_validation 1 +#define VK_NV_RAY_TRACING_VALIDATION_SPEC_VERSION 1 +#define VK_NV_RAY_TRACING_VALIDATION_EXTENSION_NAME "VK_NV_ray_tracing_validation" +typedef struct VkPhysicalDeviceRayTracingValidationFeaturesNV { + VkStructureType sType; + void* pNext; + VkBool32 rayTracingValidation; +} VkPhysicalDeviceRayTracingValidationFeaturesNV; + + + // VK_KHR_acceleration_structure is a preprocessor guard. Do not pass it to API calls. #define VK_KHR_acceleration_structure 1 #define VK_KHR_ACCELERATION_STRUCTURE_SPEC_VERSION 13 diff --git a/icd/api/include/pipeline_binary_cache.h b/icd/api/include/pipeline_binary_cache.h index 4e18cdd8..a244f765 100644 --- a/icd/api/include/pipeline_binary_cache.h +++ b/icd/api/include/pipeline_binary_cache.h @@ -42,7 +42,7 @@ namespace Util class IPlatformKey; #if ICD_GPUOPEN_DEVMODE_BUILD -class DevModeMgr; +class IDevMode; #endif } // namespace Util @@ -64,7 +64,7 @@ class PipelineBinaryCache const vk::RuntimeSettings& settings, const char* pDefaultCacheFilePath, #if ICD_GPUOPEN_DEVMODE_BUILD - vk::DevModeMgr* pDevModeMgr, + vk::IDevMode* pDevMode, #endif uint32_t expectedEntries, size_t initDataSize, @@ -238,7 +238,7 @@ class PipelineBinaryCache Util::ICacheLayer* m_pTopLayer; // Top layer of the cache chain where queries are submitted #if ICD_GPUOPEN_DEVMODE_BUILD - vk::DevModeMgr* m_pDevModeMgr; + vk::IDevMode* m_pDevMode; Util::ICacheLayer* m_pReinjectionLayer; // Reinjection interface layer HashMapping m_hashMapping; // Maps the internalPipelineHash to the appropriate CacheId diff --git a/icd/api/include/pipeline_compiler.h b/icd/api/include/pipeline_compiler.h index 1530c700..e10e5e34 100644 --- a/icd/api/include/pipeline_compiler.h +++ b/icd/api/include/pipeline_compiler.h @@ -115,6 +115,15 @@ struct RayTracingPipelineShaderStageInfo }; #endif +// ===================================================================================================================== +/// Determines whether the given stage info is from shader module identifier. 
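+/// A stage supplied through VK_EXT_shader_module_identifier carries no module data; only the client-provided hash
+/// (options.clientHash) is non-zero, which is exactly what this predicate tests.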
+inline bool IsShaderModuleIdentifier(const Vkgc::PipelineShaderInfo& stageInfo) +{ + return (stageInfo.pModuleData == nullptr) && + ((stageInfo.options.clientHash.lower != 0) || + (stageInfo.options.clientHash.upper != 0)); +} + // ===================================================================================================================== class PipelineCompiler { @@ -458,6 +467,14 @@ class PipelineCompiler static void DumpPipelineMetadata( void* pPipelineDumpHandle, const PipelineMetadata* pBinaryMetadata); + + void DumpPipeline( + const RuntimeSettings& settings, + const Vkgc::PipelineBuildInfo& pipelineInfo, + uint64_t apiPsoHash, + uint32_t binaryCount, + const Vkgc::BinaryData* pElfBinary, + VkResult result); private: PAL_DISALLOW_COPY_AND_ASSIGN(PipelineCompiler); diff --git a/icd/api/include/vk_cmdbuffer.h b/icd/api/include/vk_cmdbuffer.h index 63631471..192d051a 100644 --- a/icd/api/include/vk_cmdbuffer.h +++ b/icd/api/include/vk_cmdbuffer.h @@ -458,6 +458,14 @@ class CmdBuffer VkBuffer countBuffer, VkDeviceSize countOffset); + template< bool indexed, bool useBufferCount> + void DrawIndirect( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, + VkDeviceSize countBufferVa); + void DrawMeshTasks( uint32_t x, uint32_t y, @@ -472,6 +480,14 @@ class CmdBuffer VkBuffer countBuffer, VkDeviceSize countOffset); + template + void DrawMeshTasksIndirect( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, + VkDeviceSize countBufferVa); + void Dispatch( uint32_t x, uint32_t y, @@ -485,10 +501,17 @@ class CmdBuffer uint32_t dim_y, uint32_t dim_z); + void DispatchIndirect( + VkDeviceSize indirectBufferVa); + void DispatchIndirect( VkBuffer buffer, VkDeviceSize offset); + void ExecuteIndirect( + VkBool32 isPreprocessed, + const VkGeneratedCommandsInfoNV* pInfo); + template void CopyBuffer( VkBuffer srcBuffer, @@ -1428,6 +1451,11 @@ class CmdBuffer } #if VKI_RAY_TRACING + uint64 GetCpsMemSize() const { return m_maxCpsMemSize; } + + void ApplyPatchCpsRequests( + uint32_t deviceIdx, + const Pal::IGpuMemory& cpsMem) const; bool HasRayTracing() const { return m_flags.hasRayTracing; } #endif @@ -1460,11 +1488,10 @@ class CmdBuffer { return &m_debugPrintf; } + private: PAL_DISALLOW_COPY_AND_ASSIGN(CmdBuffer); - uint32 GetHevcDbpIndex(const uint8_t* pRefPicList, uint32 dpbSlot); - void ValidateGraphicsStates(); void ValidateSamplePattern(uint32_t sampleCount, SamplePattern* pSamplePattern); @@ -1823,6 +1850,7 @@ class CmdBuffer const RuntimeSettings& settings, CmdPool* pCmdPool, const RayTracingPipeline* pPipeline, + uint32* pConstMem, Pal::gpusize constGpuAddr, uint32_t width, uint32_t height, @@ -1851,7 +1879,15 @@ class CmdBuffer uint32_t height, uint32_t depth, Buffer* pIndirectBuffer, - VkDeviceSize indirectOffset); + VkDeviceSize indirectOffset, + const Pal::gpusize indirectBufferVa); + + void AddPatchCpsRequest( + uint32_t deviceIdx, + GpuRt::DispatchRaysConstants* pConstsMem, + uint64_t bufSize); + + void FreePatchCpsList(); #endif void InsertDebugMarker( @@ -1943,6 +1979,11 @@ class CmdBuffer bool m_reverseThreadGroupState; #if VKI_RAY_TRACING Util::Vector m_scratchVidMemList; // Ray-tracing scratch memory + + uint64 m_maxCpsMemSize; // max ray sorting memory requested + + typedef Util::Vector PatchCpsVector; + PatchCpsVector m_patchCpsList[MaxPalDevices]; #endif }; @@ -2234,6 +2275,26 @@ VKAPI_ATTR void VKAPI_CALL vkCmdDispatchIndirect( VkBuffer buffer, VkDeviceSize offset); 
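+// The declarations below are the VK_NV_device_generated_commands entry points. In this driver, preprocessing is a
+// no-op (vkCmdPreprocessGeneratedCommandsNV has an empty definition in entry.cpp) and execution forwards to
+// CmdBuffer::ExecuteIndirect(). A hedged application-side sketch:
+//     VkGeneratedCommandsInfoNV info = { VK_STRUCTURE_TYPE_GENERATED_COMMANDS_INFO_NV };
+//     // ... fill pipelineBindPoint, pipeline, indirectCommandsLayout, streams, sequencesCount ...
+//     vkCmdExecuteGeneratedCommandsNV(cmdBuf, VK_FALSE, &info);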
+VKAPI_ATTR void VKAPI_CALL vkCmdPreprocessGeneratedCommandsNV( + VkCommandBuffer commandBuffer, + const VkGeneratedCommandsInfoNV* pGeneratedCommandsInfo); + +VKAPI_ATTR void VKAPI_CALL vkCmdExecuteGeneratedCommandsNV( + VkCommandBuffer commandBuffer, + VkBool32 isPreprocessed, + const VkGeneratedCommandsInfoNV* pGeneratedCommandsInfo); + +VKAPI_ATTR void VKAPI_CALL vkCmdBindPipelineShaderGroupNV( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipeline pipeline, + uint32_t groupIndex); + +VKAPI_ATTR void VKAPI_CALL vkCmdUpdatePipelineIndirectBufferNV( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipeline pipeline); + VKAPI_ATTR void VKAPI_CALL vkCmdDispatchBase( VkCommandBuffer commandBuffer, uint32_t baseGroupX, diff --git a/icd/api/include/vk_compute_pipeline.h b/icd/api/include/vk_compute_pipeline.h index 8ea4e790..2e6e80b3 100644 --- a/icd/api/include/vk_compute_pipeline.h +++ b/icd/api/include/vk_compute_pipeline.h @@ -72,7 +72,7 @@ class ComputePipeline final : public Pipeline, public NonDispatchable(palShaderStageMask); +} + // ===================================================================================================================== struct UberFetchShaderFormatInfo { @@ -4021,6 +4073,26 @@ VkFormat GetLowPrecisionDepthFormat( const VkImageUsageFlags& imageUsage, const RuntimeSettings& settings); +const char* VkResultName(VkResult result); + +inline std::chrono::nanoseconds Uint64ToChronoNano(uint64_t nanoSeconds) +{ + const uint64_t maxNano = static_cast(std::chrono::nanoseconds::max().count()); + return std::chrono::nanoseconds { Util::Min(nanoSeconds, maxNano) }; +} + +inline std::chrono::milliseconds Uint64ToChronoMilli(uint64_t milliSeconds) +{ + const uint64_t maxMilli = static_cast(std::chrono::milliseconds::max().count()); + return std::chrono::milliseconds { Util::Min(milliSeconds, maxMilli) }; +} + +inline std::chrono::seconds Uint64ToChronoSeconds(uint64_t seconds) +{ + const uint64_t maxSeconds = static_cast(std::chrono::seconds::max().count()); + return std::chrono::seconds { Util::Min(seconds, maxSeconds) }; +} + } // namespace vk #endif /* __VK_CONV_H__ */ diff --git a/icd/api/include/vk_device.h b/icd/api/include/vk_device.h index 654dd80e..7024ca43 100644 --- a/icd/api/include/vk_device.h +++ b/icd/api/include/vk_device.h @@ -166,8 +166,9 @@ class Device // True if EXT_PRIMITIVES_GENERATED_QUERY is enabled. 
uint32 primitivesGeneratedQuery : 1; uint32 reserved1 : 1; + uint32 reserved2 : 1; - uint32 reserved : 13; + uint32 reserved : 12; }; uint32 u32All; @@ -382,6 +383,11 @@ class Device const VkAllocationCallbacks* pAllocator, VkSwapchainKHR* pSwapChain); + VkResult CreateIndirectCommandsLayout( + const VkIndirectCommandsLayoutCreateInfoNV* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkIndirectCommandsLayoutNV* pIndirectCommandsLayout); + VkResult ImportSemaphore( VkSemaphore semaphore, const ImportSemaphoreInfo& importInfo); @@ -1422,6 +1428,22 @@ VKAPI_ATTR void VKAPI_CALL vkGetImageSubresourceLayout2KHR( const VkImageSubresource2KHR* pSubresource, VkSubresourceLayout2KHR* pLayout); +VKAPI_ATTR VkResult VKAPI_CALL vkCreateIndirectCommandsLayoutNV( + VkDevice device, + const VkIndirectCommandsLayoutCreateInfoNV* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkIndirectCommandsLayoutNV* pIndirectCommandsLayout); + +VKAPI_ATTR void VKAPI_CALL vkDestroyIndirectCommandsLayoutNV( + VkDevice device, + VkIndirectCommandsLayoutNV indirectCommandsLayout, + const VkAllocationCallbacks* pAllocator); + +VKAPI_ATTR void VKAPI_CALL vkGetGeneratedCommandsMemoryRequirementsNV( + VkDevice device, + const VkGeneratedCommandsMemoryRequirementsInfoNV* pInfo, + VkMemoryRequirements2* pMemoryRequirements); + } // namespace entry } // namespace vk diff --git a/icd/api/include/vk_extensions.h b/icd/api/include/vk_extensions.h index 7cb5f137..c116faed 100644 --- a/icd/api/include/vk_extensions.h +++ b/icd/api/include/vk_extensions.h @@ -408,6 +408,7 @@ class DeviceExtensions final : public Extensions EXT_MEMORY_PRIORITY, EXT_MESH_SHADER, EXT_MUTABLE_DESCRIPTOR_TYPE, + EXT_NESTED_COMMAND_BUFFER, EXT_NON_SEAMLESS_CUBE_MAP, EXT_PAGEABLE_DEVICE_LOCAL_MEMORY, EXT_PCI_BUS_INFO, @@ -474,6 +475,9 @@ class DeviceExtensions final : public Extensions GOOGLE_USER_TYPE, NV_COMPUTE_SHADER_DERIVATIVES, + NV_DEVICE_GENERATED_COMMANDS, + NV_DEVICE_GENERATED_COMMANDS_COMPUTE, + VALVE_MUTABLE_DESCRIPTOR_TYPE, Count }; diff --git a/icd/api/include/vk_formats.h b/icd/api/include/vk_formats.h index f4b76ca6..7d3a2497 100755 --- a/icd/api/include/vk_formats.h +++ b/icd/api/include/vk_formats.h @@ -51,6 +51,8 @@ struct AstcMappedInfo }; #endif +class PhysicalDevice; + // ===================================================================================================================== // Container for storing compile-time meta-information about Vulkan formats. 
// @@ -82,6 +84,16 @@ struct Formats #endif static VkExtent3D ElementsToTexels(VkFormat format, const VkExtent3D& extent, const RuntimeSettings& settings); static Pal::Formats::NumericSupportFlags GetNumberFormat(VkFormat format, const RuntimeSettings& settings); + + static VkFormat GetCompatibleSinglePlaneFormat( + VkFormat multiPlaneFormat, + uint32_t planeIndex); + + static VkFormatFeatureFlags GetExtendedFeatureFlags( + const PhysicalDevice* pPhysicalDevice, + VkFormat format, + VkImageTiling tiling, + const RuntimeSettings& settings); }; #define VK_EXT_4444_FORMAT_START VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT @@ -511,7 +523,6 @@ bool Formats::IsDvec3Or4( return needsTwoLocations; } - } // namespace vk #endif /* __VK_FORMATS_H__ */ diff --git a/icd/api/include/vk_indirect_commands_layout.h b/icd/api/include/vk_indirect_commands_layout.h new file mode 100644 index 00000000..211c0d10 --- /dev/null +++ b/icd/api/include/vk_indirect_commands_layout.h @@ -0,0 +1,147 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file vk_indirect_commands_layout.h + * @brief Functionality related to Vulkan indirect commands layout objects. 
+ *********************************************************************************************************************** + */ + +#ifndef __VK_INDIRECT_COMMANDS_LAYOUT_H__ +#define __VK_INDIRECT_COMMANDS_LAYOUT_H__ + +#pragma once + +#include "include/khronos/vulkan.h" +#include "include/vk_device.h" +#include "include/vk_dispatch.h" +#include "include/vk_pipeline_layout.h" + +#include "palIndirectCmdGenerator.h" + +namespace Pal +{ + +class IIndirectCmdGenerator; +struct IndirectCmdGeneratorCreateInfo; +struct IndirectParam; + +}; + +namespace vk +{ + +enum IndirectCommandsActionType +{ + Draw = 0, + DrawIndexed, + Dispatch, + MeshTask +}; + +struct IndirectCommandsInfo +{ + IndirectCommandsActionType actionType; +}; + + // ===================================================================================================================== + // API implementation of Vulkan indirect commands layout + // + // Indirect commands layout objects describe the information of indirect commands, as well as how to interpret and + // process indirect buffers. +class IndirectCommandsLayout final : public NonDispatchable +{ +public: + static VkResult Create( + const Device* pDevice, + const VkIndirectCommandsLayoutCreateInfoNV* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkIndirectCommandsLayoutNV* pLayout); + + void CalculateMemoryRequirements( + const Device* pDevice, + VkMemoryRequirements2* pMemoryRequirements) const; + + void BindPreprocessBuffer( + VkBuffer buffer, + VkDeviceSize memOffset, + uint32_t deviceIdx); + + VkResult Destroy( + Device* pDevice, + const VkAllocationCallbacks* pAllocator); + + const Pal::IIndirectCmdGenerator* PalIndirectCmdGenerator(uint32_t idx) const + { + return m_perGpu[idx].pGenerator; + } + + IndirectCommandsInfo GetIndirectCommandsInfo() const + { + return m_info; + } + +private: + + PAL_DISALLOW_COPY_AND_ASSIGN(IndirectCommandsLayout); + + struct PerGpuInfo + { + Pal::IIndirectCmdGenerator* pGenerator; + Pal::gpusize preprocessBufferVirtAddr; + }; + + IndirectCommandsLayout( + const Device* pDevice, + const IndirectCommandsInfo& info, + Pal::IIndirectCmdGenerator** pGenerator, + const Pal::IndirectCmdGeneratorCreateInfo& palCreateInfo); + + static size_t ObjectSize(const Device* pDevice) + { + return sizeof(IndirectCommandsLayout) + ((pDevice->NumPalDevices() - 1) * sizeof(PerGpuInfo)); + } + + static void BuildPalCreateInfo( + const Device* pDevice, + const VkIndirectCommandsLayoutCreateInfoNV* pCreateInfo, + Pal::IndirectParam* pIndirectParams, + Pal::IndirectCmdGeneratorCreateInfo* pPalCreateInfo); + + IndirectCommandsInfo m_info; + Pal::IndirectCmdGeneratorCreateInfo m_palCreateInfo; + PerGpuInfo m_perGpu[1]; +}; + +// Max usage is the situation where indirect commands layout drains push constants size plus uses indirect index & vertex +// buffer binding and ends with a draw indexed. +constexpr uint32_t MaxIndirectTokenCount = MaxPushConstRegCount + 3; +constexpr uint32_t MaxIndirectTokenOffset = MaxPushConstants + + sizeof(VkBindIndexBufferIndirectCommandNV) + + sizeof(VkBindVertexBufferIndirectCommandNV) + + sizeof(VkDrawIndexedIndirectCommand); +} // namespace vk + +#endif /* __VK_INDIRECT_COMMANDS_LAYOUT_H__ */ diff --git a/icd/api/include/vk_instance.h b/icd/api/include/vk_instance.h index ce615435..f0771fa5 100644 --- a/icd/api/include/vk_instance.h +++ b/icd/api/include/vk_instance.h @@ -60,7 +60,7 @@ namespace vk { // Forward declare classes used in this file. 
-class DevModeMgr; +class IDevMode; class ApiInstance; class DisplayManager; class GpuMemoryEventHandler; @@ -231,8 +231,8 @@ class Instance Pal::NullGpuId GetNullGpuId() const { return m_nullGpuId; } - DevModeMgr* GetDevModeMgr() - { return m_pDevModeMgr; } + IDevMode* GetDevModeMgr() + { return m_pDevMode; } GpuMemoryEventHandler* GetGpuMemoryEventHandler() const { return m_pGpuMemoryEventHandler; } @@ -359,7 +359,7 @@ class Instance ScreenObject m_screens[Pal::MaxScreens]; void* m_pScreenStorage; - DevModeMgr* m_pDevModeMgr; // GPUOpen Developer Mode manager. + IDevMode* m_pDevMode; // GPUOpen Developer Mode manager. static const size_t APP_INFO_MAX_CHARS = 256; char m_applicationName[APP_INFO_MAX_CHARS]; diff --git a/icd/api/include/vk_physical_device_manager.h b/icd/api/include/vk_physical_device_manager.h index f92229fe..019c165a 100644 --- a/icd/api/include/vk_physical_device_manager.h +++ b/icd/api/include/vk_physical_device_manager.h @@ -50,6 +50,7 @@ namespace vk // Forward declare Vulkan classes used in this file. class Instance; class PhysicalDevice; +class ExperimentsLoader; class PhysicalDeviceManager { @@ -111,6 +112,7 @@ class PhysicalDeviceManager Util::Mutex m_devicesLock; // Mutex used to lock access to the vector of physical devices VkPhysicalDeviceProperties* m_pAllNullProperties; // Physical device properties exposed when NULL_GPU=ALL + ExperimentsLoader* m_pExperimentsLoader; }; } diff --git a/icd/api/include/vk_pipeline.h b/icd/api/include/vk_pipeline.h index 81e66e97..ab8a785e 100644 --- a/icd/api/include/vk_pipeline.h +++ b/icd/api/include/vk_pipeline.h @@ -245,11 +245,11 @@ class Pipeline protected: Pipeline( - Device* const pDevice, + Device* const pDevice, #if VKI_RAY_TRACING - bool hasRayTracing, + bool hasRayTracing, #endif - VkPipelineBindPoint type); + VkPipelineBindPoint type); void Init( Pal::IPipeline** pPalPipeline, @@ -347,6 +347,14 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetPipelineExecutableInternalRepresentationsKHR uint32_t* pInternalRepresentationCount, VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations); +VKAPI_ATTR VkDeviceAddress VKAPI_CALL vkGetPipelineIndirectDeviceAddressNV( + VkDevice device, + const VkPipelineIndirectDeviceAddressInfoNV* pInfo); + +VKAPI_ATTR void VKAPI_CALL vkGetPipelineIndirectMemoryRequirementsNV( + VkDevice device, + const VkComputePipelineCreateInfo* pCreateInfo, + VkMemoryRequirements2* pMemoryRequirements); }; } // namespace vk diff --git a/icd/api/include/vk_queue.h b/icd/api/include/vk_queue.h index dbc9453c..ac3f5215 100644 --- a/icd/api/include/vk_queue.h +++ b/icd/api/include/vk_queue.h @@ -40,6 +40,7 @@ #include "include/vk_instance.h" #include "include/vk_utils.h" #include "include/virtual_stack_mgr.h" +#include "include/internal_mem_mgr.h" #include "palQueue.h" @@ -56,7 +57,7 @@ namespace vk struct CmdBufState; class CmdBufferRing; class Device; -class DevModeMgr; +class IDevMode; class ApiQueue; class Instance; class SwapChain; @@ -66,6 +67,15 @@ class SqttQueueState; class PhysicalDevice; class Memory; +#if VKI_RAY_TRACING +// Memory tracker for CPS stack memory to be freed +struct CpsMemTracker +{ + InternalMemory* pMem; + Pal::IFence* pFence; +}; +#endif + // ===================================================================================================================== // A Vulkan queue. 
class Queue @@ -255,10 +265,6 @@ class Queue const Pal::CmdBufInfo& cmdBufInfo, CmdBufState* pCmdBufState); - VkResult SynchronizeBackBuffer( - Memory* pMemory, - uint32_t deviceIdx); - protected: // This is a helper structure during a virtual remap (sparse bind) call to batch remaps into // as few calls as possible. @@ -353,9 +359,17 @@ class Queue const Pal::PresentSwapChainInfo* pPresentInfo); void DevModeFrameBoundary( - DevModeMgr* pDevModeMgr, + IDevMode* pDevMode, const VkFrameBoundaryEXT* pFrameBoundaryInfo); +#if VKI_RAY_TRACING + void FreeRetiredCpsStackMem(); + + Pal::IFence* GetCpsStackMem( + uint32_t deviceIdx, + uint64_t size); +#endif + Pal::IQueue* m_pPalQueues[MaxPalDevices]; Pal::IQueue* m_pPalBackupQueues[MaxPalDevices]; Pal::IQueue* m_pPalBackupTmzQueues[MaxPalDevices]; @@ -369,7 +383,7 @@ class Queue uint32_t m_queueFamilyIndex; // This queue's family index uint32_t m_queueIndex; // This queue's index within the node group uint32_t m_queueFlags; - DevModeMgr* m_pDevModeMgr; + IDevMode* m_pDevMode; VirtualStackAllocator* m_pStackAllocator; VidPnSourceFlipStatus m_flipStatus; Pal::PerSourceFrameMetadataControl m_palFrameMetadataControl; @@ -379,6 +393,15 @@ class Queue const bool m_isDeviceIndependent; +#if VKI_RAY_TRACING + InternalMemory* m_pCpsGlobalMem; + + typedef Util::List CpsMemDestroyList; + typedef Util::ListIterator CpsMemDestroyListIterator; + + CpsMemDestroyList m_cpsMemDestroyList; // List of CPS stack memory to be destroyed +#endif + private: PAL_DISALLOW_COPY_AND_ASSIGN(Queue); }; diff --git a/icd/api/include/vk_swapchain.h b/icd/api/include/vk_swapchain.h index f845f7dd..f04ae65c 100644 --- a/icd/api/include/vk_swapchain.h +++ b/icd/api/include/vk_swapchain.h @@ -154,6 +154,8 @@ class SwapChain final : public NonDispatchable const Pal::ScreenColorConfig& GetColorParams() const { return m_colorParams; } + bool IsFullscreenOrEfsePresent() const; + Pal::IGpuMemory* UpdatePresentInfo( uint32_t deviceIdx, uint32_t imageIndex, @@ -187,9 +189,9 @@ class SwapChain final : public NonDispatchable void MarkAsDeprecated( const VkAllocationCallbacks* pAllocator); - bool IsDxgiEnabled() const + uint32_t GetVidPnSourceId() const { - return (m_properties.displayableInfo.palPlatform == Pal::WsiPlatform::Dxgi); + return m_vidPnSourceId; } bool IsSuboptimal(uint32_t deviceIdx); @@ -200,6 +202,7 @@ class SwapChain final : public NonDispatchable const Properties& properties, VkPresentModeKHR presentMode, FullscreenMgr* pFullscreenMgr, + uint32_t vidPnSourceId, Pal::WorkstationStereoMode wsStereoMode, Pal::ISwapChain* pPalSwapChain); @@ -224,7 +227,9 @@ class SwapChain final : public NonDispatchable uint32_t m_queueFamilyIndex; // Queue family index of the last present - Pal::WorkstationStereoMode m_wsStereoMode; // Workstation Stereo Mode + uint32_t m_vidPnSourceId; // Video present source identifier.
+ + Pal::WorkstationStereoMode m_wsStereoMode; // Workstation Stereo Mode Device::InternalPipeline m_pAutoStereoPipeline; // Auto Stereo shader private: @@ -263,8 +268,7 @@ class FullscreenMgr FullscreenMgr::Mode mode, Pal::IScreen* pScreen, Pal::OsDisplayHandle hDisplay, - Pal::OsWindowHandle hWindow, - uint32_t vidPnSourceId); + Pal::OsWindowHandle hWindow); ~FullscreenMgr(); @@ -283,19 +287,11 @@ class FullscreenMgr void Destroy(const VkAllocationCallbacks* pAllocator); - void UpdatePresentInfo( - SwapChain* pSwapChain, - Pal::PresentSwapChainInfo* pPresentInfo, - const Pal::FlipStatusFlags& flipFlags); - Pal::Result IsFullscreenOwnershipSafe() const; ExclusiveModeFlags GetExclusiveModeFlags() const { return m_exclusiveModeFlags; } - uint32_t GetVidPnSourceId() const - { return m_vidPnSourceId; } - Pal::IScreen* GetPalScreen() const { return m_pScreen; } @@ -323,8 +319,7 @@ class FullscreenMgr Pal::OsDisplayHandle m_hDisplay; // The monitor of the IScreen from swap chain creation Pal::OsWindowHandle m_hWindow; // The window of the swap chain - uint32_t m_vidPnSourceId; // Video present source identifier - Mode m_mode; // Indicates the Presentation mode we are using + Mode m_mode; // Indicates the Presentation mode we are using }; // ===================================================================================================================== diff --git a/icd/api/include/vk_utils.h b/icd/api/include/vk_utils.h index 8c048313..cda14997 100644 --- a/icd/api/include/vk_utils.h +++ b/icd/api/include/vk_utils.h @@ -143,14 +143,12 @@ inline uint64_t TicksToNano(uint64_t ticks) // Get driver build time hash uint32_t GetBuildTimeHash(); -#if DEBUG // ===================================================================================================================== // If turned on and exe name is a match, this function spins idle until we have a debugger hooked. 
void WaitIdleForDebugger(bool waitIdleToggled, const char* pWaitIdleExeName, uint32_t debugTimeout); -#endif // ===================================================================================================================== -// This function can be used to get the right externsion structure of specific type in case there are more than one +// This function can be used to get the right extension structure of a specific type in case more than one extension is supported inline const VkStructHeader* GetExtensionStructure(const VkStructHeader* pHeader, VkStructureType sType) { diff --git a/icd/api/pipeline_binary_cache.cpp b/icd/api/pipeline_binary_cache.cpp index 8803ae9c..b157a3b2 100644 --- a/icd/api/pipeline_binary_cache.cpp +++ b/icd/api/pipeline_binary_cache.cpp @@ -109,7 +109,7 @@ PipelineBinaryCache* PipelineBinaryCache::Create( const RuntimeSettings& settings, const char* pDefaultCacheFilePath, #if ICD_GPUOPEN_DEVMODE_BUILD - vk::DevModeMgr* pDevModeMgr, + vk::IDevMode* pDevMode, #endif uint32_t expectedEntries, size_t initDataSize, @@ -129,7 +129,7 @@ PipelineBinaryCache* PipelineBinaryCache::Create( pObj = VK_PLACEMENT_NEW(pMem) PipelineBinaryCache(pAllocationCallbacks, gfxIp, expectedEntries); #if ICD_GPUOPEN_DEVMODE_BUILD - pObj->m_pDevModeMgr = pDevModeMgr; + pObj->m_pDevMode = pDevMode; #endif if (pObj->Initialize(settings, createArchiveLayers, pDefaultCacheFilePath, pKey) != VK_SUCCESS) @@ -193,7 +193,7 @@ PipelineBinaryCache::PipelineBinaryCache( m_pPlatformKey { nullptr }, m_pTopLayer { nullptr }, #if ICD_GPUOPEN_DEVMODE_BUILD - m_pDevModeMgr { nullptr }, + m_pDevMode { nullptr }, m_pReinjectionLayer { nullptr }, m_hashMapping { 32, &m_palAllocator }, #endif @@ -530,9 +530,9 @@ void PipelineBinaryCache::FreePipelineBinary( void PipelineBinaryCache::Destroy() { #if ICD_GPUOPEN_DEVMODE_BUILD - if (m_pDevModeMgr != nullptr) + if (m_pDevMode != nullptr) { - m_pDevModeMgr->DeregisterPipelineCache(this); + m_pDevMode->DeregisterPipelineCache(this); } #endif @@ -575,7 +575,7 @@ VkResult PipelineBinaryCache::Initialize( if ((result == VK_SUCCESS) && (m_pReinjectionLayer != nullptr)) { - Util::Result palResult = m_pDevModeMgr->RegisterPipelineCache( + Util::Result palResult = m_pDevMode->RegisterPipelineCache( this, settings.devModePipelineUriServicePostSizeLimit); @@ -612,7 +612,7 @@ VkResult PipelineBinaryCache::InitReinjectionLayer( { VkResult result = VK_ERROR_FEATURE_NOT_PRESENT; - if (m_pDevModeMgr != nullptr) + if (m_pDevMode != nullptr) { Util::MemoryCacheCreateInfo info = {}; Util::AllocCallbacks allocCbs = { @@ -1080,13 +1080,17 @@ VkResult PipelineBinaryCache::InitArchiveLayers( if (settings.allowCleanUpCacheDirectory) { - uint64 totalSize = 0, oldestTime = 0; + uint64 totalSize = 0; + Util::SecondsSinceEpoch oldestTime = { }; if (Util::GetStatusOfDir(pCachePath, &totalSize, &oldestTime) == Util::Result::Success) { if (totalSize >= settings.pipelineCacheDefaultLocationLimitation) { - Util::RemoveFilesOfDirOlderThan(pCachePath, - oldestTime + settings.thresholdOfCleanUpCache); + const uint64 sec = oldestTime.time_since_epoch().count() + + settings.thresholdOfCleanUpCache; + + Util::RemoveFilesOfDirOlderThan( + pCachePath, Util::SecondsSinceEpoch { Uint64ToChronoSeconds(sec) }); } } } diff --git a/icd/api/pipeline_compiler.cpp b/icd/api/pipeline_compiler.cpp index 23dc846f..99083a4b 100644 --- a/icd/api/pipeline_compiler.cpp +++ b/icd/api/pipeline_compiler.cpp @@ -1110,15 +1110,6 @@ VkResult PipelineCompiler::CreateGraphicsPipelineBinary( } } - if
(shouldCompile) - { - if ((settings.ignoreFlagFailOnPipelineCompileRequired == false) && - (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)) - { - result = VK_PIPELINE_COMPILE_REQUIRED_EXT; - } - } - if (settings.enablePipelineDump && (result == VK_SUCCESS)) { Vkgc::PipelineDumpOptions dumpOptions = {}; @@ -1181,6 +1172,9 @@ VkResult PipelineCompiler::CreateGraphicsPipelineBinary( DumpPipelineMetadata(pPipelineDumpHandle, pCreateInfo->pBinaryMetadata); } + char resultMsg[64]; + Util::Snprintf(resultMsg, sizeof(resultMsg), "\n;CompileResult=%s\n", VkResultName(result)); + Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, resultMsg); Vkgc::IPipelineDumper::EndPipelineDump(pPipelineDumpHandle); } @@ -1244,6 +1238,9 @@ VkResult PipelineCompiler::CreateGraphicsShaderBinary( if (pPipelineDumpHandle != nullptr) { + char resultMsg[64]; + Util::Snprintf(resultMsg, sizeof(resultMsg), "\n;CompileResult=%s\n", VkResultName(result)); + Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, resultMsg); Vkgc::IPipelineDumper::EndPipelineDump(pPipelineDumpHandle); } @@ -1299,12 +1296,19 @@ VkResult PipelineCompiler::CreateColorExportShaderLibrary( Vkgc::PipelineBuildInfo pipelineInfo = {}; GraphicsPipelineBuildInfo graphicsInfo = pCreateInfo->pipelineInfo; graphicsInfo.task.pModuleData = nullptr; + graphicsInfo.task.options.clientHash = {}; graphicsInfo.vs.pModuleData = nullptr; + graphicsInfo.vs.options.clientHash = {}; graphicsInfo.tcs.pModuleData = nullptr; + graphicsInfo.tcs.options.clientHash = {}; graphicsInfo.tes.pModuleData = nullptr; + graphicsInfo.tes.options.clientHash = {}; graphicsInfo.gs.pModuleData = nullptr; + graphicsInfo.gs.options.clientHash = {}; graphicsInfo.mesh.pModuleData = nullptr; + graphicsInfo.mesh.options.clientHash = {}; graphicsInfo.fs.pModuleData = nullptr; + graphicsInfo.fs.options.clientHash = {}; pipelineInfo.pGraphicsInfo = &graphicsInfo; pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump(&dumpOptions, @@ -1364,6 +1368,9 @@ VkResult PipelineCompiler::CreateColorExportShaderLibrary( if (pPipelineDumpHandle != nullptr) { + char resultMsg[64]; + Util::Snprintf(resultMsg, sizeof(resultMsg), "\n;CompileResult=%s\n", VkResultName(result)); + Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, resultMsg); Vkgc::IPipelineDumper::EndPipelineDump(pPipelineDumpHandle); } pCreateInfo->pipelineInfo.unlinked = false; @@ -1477,14 +1484,6 @@ VkResult PipelineCompiler::CreateComputePipelineBinary( } } } - if (shouldCompile) - { - if ((settings.ignoreFlagFailOnPipelineCompileRequired == false) && - (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)) - { - result = VK_PIPELINE_COMPILE_REQUIRED_EXT; - } - } if (settings.enablePipelineDump && (result == VK_SUCCESS)) { @@ -1540,6 +1539,9 @@ VkResult PipelineCompiler::CreateComputePipelineBinary( { Vkgc::IPipelineDumper::DumpPipelineBinary(pPipelineDumpHandle, m_gfxIp, pPipelineBinary); } + char resultMsg[64]; + Util::Snprintf(resultMsg, sizeof(resultMsg), "\n;CompileResult=%s\n", VkResultName(result)); + Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, resultMsg); Vkgc::IPipelineDumper::EndPipelineDump(pPipelineDumpHandle); } @@ -2090,18 +2092,6 @@ static void BuildMultisampleStateInFoi( } } -// ===================================================================================================================== -static void BuildViewportState( - const Device* pDevice, - const 
VkPipelineViewportStateCreateInfo* pVs, - const uint64_t dynamicStateFlags, - GraphicsPipelineBinaryCreateInfo* pCreateInfo) -{ - if (pVs != nullptr) - { - } -} - // ===================================================================================================================== void PipelineCompiler::BuildNggState( const Device* pDevice, @@ -2228,18 +2218,10 @@ void PipelineCompiler::BuildPipelineShaderInfo( pCompiler->ApplyDefaultShaderOptions(stage, pShaderInfoIn->flags, &pShaderInfoOut->options); - if (pShaderInfoIn->pModuleHandle != nullptr) - { - Pal::ShaderHash clientHash = ShaderModule::GetCodeHash( - pShaderInfoIn->pModuleHandle->codeHash, pShaderInfoIn->pEntryPoint); - pShaderInfoOut->options.clientHash.lower = clientHash.lower; - pShaderInfoOut->options.clientHash.upper = clientHash.upper; - } - else - { - pShaderInfoOut->options.clientHash.lower = pShaderInfoIn->codeHash.lower; - pShaderInfoOut->options.clientHash.upper = pShaderInfoIn->codeHash.upper; - } + + pShaderInfoOut->options.clientHash.lower = pShaderInfoIn->codeHash.lower; + pShaderInfoOut->options.clientHash.upper = pShaderInfoIn->codeHash.upper; + ApplyProfileOptions(pDevice, static_cast(stage), pPipelineOptions, @@ -2441,11 +2423,15 @@ static void BuildColorBlendState( { uint32_t location = i; - if ((extStructs.pRenderingAttachmentLocationInfo != nullptr) && - (extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations != nullptr) && - (extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations[i] != VK_ATTACHMENT_UNUSED)) + if ((extStructs.pRenderingAttachmentLocationInfo != nullptr) && + (extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations != nullptr)) { location = extStructs.pRenderingAttachmentLocationInfo->pColorAttachmentLocations[i]; + + if (location == VK_ATTACHMENT_UNUSED) + { + continue; + } } auto pLlpcCbDst = &pCreateInfo->pipelineInfo.cbState.target[location]; @@ -2624,11 +2610,6 @@ static void BuildPreRasterizationShaderState( BuildRasterizationState(pIn->pRasterizationState, dynamicStateFlags, &isConservativeOverestimation, pCreateInfo); - if (pCreateInfo->pipelineInfo.rsState.rasterizerDiscardEnable == false) - { - BuildViewportState(pDevice, pIn->pViewportState, dynamicStateFlags, pCreateInfo); - } - PipelineCompiler::BuildNggState( pDevice, activeStages, isConservativeOverestimation, unrestrictedPrimitiveTopology, pCreateInfo); @@ -3681,7 +3662,8 @@ void PipelineCompiler::FreeGraphicsPipelineCreateInfo( pCreateInfo->pTempBuffer = nullptr; } - if (pCreateInfo->pBinaryMetadata->internalBufferInfo.pData != nullptr) + if ((pCreateInfo->pBinaryMetadata != nullptr) && + (pCreateInfo->pBinaryMetadata->internalBufferInfo.pData != nullptr)) { pInstance->FreeMem(pCreateInfo->pBinaryMetadata->internalBufferInfo.pData); pCreateInfo->pBinaryMetadata->internalBufferInfo.pData = nullptr; @@ -4074,15 +4056,6 @@ VkResult PipelineCompiler::CreateRayTracingPipelineBinary( bool shaderModuleReplaced = false; - if (shouldCompile) - { - if ((settings.ignoreFlagFailOnPipelineCompileRequired == false) && - (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)) - { - result = VK_PIPELINE_COMPILE_REQUIRED_EXT; - } - } - if (settings.enablePipelineDump && (result == VK_SUCCESS)) { Vkgc::PipelineDumpOptions dumpOptions = {}; @@ -4236,6 +4209,9 @@ VkResult PipelineCompiler::CreateRayTracingPipelineBinary( } } + char resultMsg[64]; + Util::Snprintf(resultMsg, sizeof(resultMsg), "\n;CompileResult=%s\n", VkResultName(result)); + 
Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, resultMsg); Vkgc::IPipelineDumper::EndPipelineDump(pPipelineDumpHandle); } @@ -5599,6 +5575,48 @@ void PipelineCompiler::DumpPipelineMetadata( } } +// ===================================================================================================================== +void PipelineCompiler::DumpPipeline( + const RuntimeSettings& settings, + const Vkgc::PipelineBuildInfo& pipelineInfo, + uint64_t apiPsoHash, + uint32_t binaryCount, + const Vkgc::BinaryData* pElfBinaries, + VkResult result) +{ + Vkgc::PipelineDumpOptions dumpOptions = {}; + dumpOptions.pDumpDir = settings.pipelineDumpDir; + dumpOptions.filterPipelineDumpByType = settings.filterPipelineDumpByType; + dumpOptions.filterPipelineDumpByHash = settings.filterPipelineDumpByHash; + dumpOptions.dumpDuplicatePipelines = settings.dumpDuplicatePipelines; + + void* pPipelineDumpHandle = nullptr; + if (settings.dumpPipelineWithApiHash) + { + pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump( + &dumpOptions, pipelineInfo, apiPsoHash); + } + else + { + pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump( + &dumpOptions, pipelineInfo); + } + + for (uint32_t i = 0; i < binaryCount; i++) + { + if ((pElfBinaries[i].codeSize > 0) && (pElfBinaries[i].pCode != nullptr)) + { + Vkgc::IPipelineDumper::DumpPipelineBinary( + pPipelineDumpHandle, m_gfxIp, &pElfBinaries[i]); + } + } + + char resultMsg[64]; + Util::Snprintf(resultMsg, sizeof(resultMsg), "\n;CompileResult=%s\n", VkResultName(result)); + Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, resultMsg); + Vkgc::IPipelineDumper::EndPipelineDump(pPipelineDumpHandle); +} + // ===================================================================================================================== // Template instantiation needed for references in other files. Linux complains if we don't do this.
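(A minimal caller-side sketch of the new PipelineCompiler::DumpPipeline helper above, mirroring the ray-tracing call site later in this diff; the local names pCompiler, pDevice, binaryCreateInfo, pipelineBinaries, deviceIdx, and result are assumptions borrowed from that call site, not part of this change.)

// Gather the build info that was (or would have been) compiled, then let the
// helper open the dump, write whatever ELF binaries exist, append the
// compile-result tag, and close the dump.
Vkgc::PipelineBuildInfo pipelineInfo = {};
pipelineInfo.pRayTracingInfo = &binaryCreateInfo.pipelineInfo;

pCompiler->DumpPipeline(
    pDevice->GetRuntimeSettings(),                  // supplies pipelineDumpDir and the dump filters
    pipelineInfo,
    binaryCreateInfo.apiPsoHash,                    // used when dumpPipelineWithApiHash is set
    pipelineBinaries[deviceIdx].pipelineBinCount,   // may be 0 when compilation failed
    pipelineBinaries[deviceIdx].pPipelineBins,      // empty entries are skipped by the helper
    result);                                        // recorded in the dump as ;CompileResult=...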
diff --git a/icd/api/raytrace/ray_tracing_device.cpp b/icd/api/raytrace/ray_tracing_device.cpp index e9a47178..17b93902 100644 --- a/icd/api/raytrace/ray_tracing_device.cpp +++ b/icd/api/raytrace/ray_tracing_device.cpp @@ -182,6 +182,7 @@ void RayTracingDevice::CreateGpuRtDeviceSettings( pDeviceSettings->fp16BoxModeMixedSaThresh = Util::Clamp(fp16BoxMixedThreshold, 1.0f, 8.0f); pDeviceSettings->enableMortonCode30 = settings.rtEnableMortonCode30; pDeviceSettings->enableVariableBitsMortonCodes = settings.enableVariableBitsMortonCodes; + pDeviceSettings->enableFastLBVH = settings.rtEnableFastLbvh; pDeviceSettings->enablePrefixScanDLB = settings.rtEnablePrefixScanDlb; switch (settings.rtTriangleCompressionMode) @@ -214,8 +215,12 @@ void RayTracingDevice::CreateGpuRtDeviceSettings( pDeviceSettings->bvhCpuBuildModeFastTrace = static_cast(settings.rtBvhCpuBuildMode); pDeviceSettings->bvhCpuBuildModeDefault = static_cast(settings.rtBvhCpuBuildMode); pDeviceSettings->bvhCpuBuildModeFastBuild = static_cast(settings.rtBvhCpuBuildMode); + pDeviceSettings->enableTriangleSplitting = settings.rtEnableTriangleSplitting; pDeviceSettings->triangleSplittingFactor = settings.rtTriangleSplittingFactor; + pDeviceSettings->tsBudgetPerTriangle = settings.rtTriangleSplittingBudgetPerTriangle; + pDeviceSettings->tsPriority = settings.rtTriangleSplittingPriority; + pDeviceSettings->enableFusedInstanceNode = settings.enableFusedInstanceNode; pDeviceSettings->rebraidFactor = settings.rebraidFactor; pDeviceSettings->rebraidLengthPercentage = settings.rebraidLengthPercentage; @@ -232,9 +237,6 @@ void RayTracingDevice::CreateGpuRtDeviceSettings( pDeviceSettings->numMortonSizeBits = settings.numMortonSizeBits; pDeviceSettings->allowFp16BoxNodesInUpdatableBvh = settings.rtAllowFp16BoxNodesInUpdatableBvh; - pDeviceSettings->enableBuildAccelStructScratchDumping = pDeviceSettings->enableBuildAccelStructDumping && - settings.rtEnableAccelerationStructureScratchMemoryDump; - // Enable AS stats based on panel setting pDeviceSettings->enableBuildAccelStructStats = settings.rtEnableBuildAccelStructStats; // Number of Rebraid Iterations and rebraid Quality Heuristics @@ -305,9 +307,10 @@ bool RayTracingDevice::AccelStructTrackerEnabled( uint32_t deviceIdx ) const { - return (GetAccelStructTracker(deviceIdx) != nullptr) && - (m_pDevice->GetRuntimeSettings().enableTraceRayAccelStructTracking || - m_pGpuRtDevice[deviceIdx]->AccelStructTraceEnabled()); + + // Enable tracking when forced on in the panel or the GPURT trace source is enabled. 
+ return ((GetAccelStructTracker(deviceIdx) != nullptr) && ( + m_pGpuRtDevice[deviceIdx]->AccelStructTraceEnabled())); } // ===================================================================================================================== @@ -1012,7 +1015,7 @@ Pal::Result RayTracingDevice::ClientFlushCmdContext( if (result == Pal::Result::Success) { - result = pCmdContext->pDevice->WaitForFences(1, &pCmdContext->pFence, true, UINT64_MAX); + result = pCmdContext->pDevice->WaitForFences(1, &pCmdContext->pFence, true, std::chrono::nanoseconds::max()); } return result; @@ -1128,15 +1131,13 @@ Pal::Result RayTracingDevice::ClientGetTemporaryGpuMemory( VK_ASSERT(pCmdbuf != nullptr); vk::Device* pDevice = pCmdbuf->VkDevice(); - for (uint32_t deviceIdx = 0; - pDevice->NumPalDevices(); - ++deviceIdx) + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); ++deviceIdx) { if (pCmdbuf->PalCmdBuffer(deviceIdx) != pPalCmdbuf) continue; InternalMemory* pVidMem = nullptr; - if (pCmdbuf->GetScratchVidMem(sizeInBytes, InternalPoolGpuReadOnlyCpuVisible, &pVidMem) == VK_SUCCESS) + if (pCmdbuf->GetScratchVidMem(sizeInBytes, InternalPoolDescriptorTable, &pVidMem) == VK_SUCCESS) { if (pVidMem != nullptr) { diff --git a/icd/api/raytrace/vk_ray_tracing_pipeline.cpp b/icd/api/raytrace/vk_ray_tracing_pipeline.cpp index a7425fad..d5ab2665 100644 --- a/icd/api/raytrace/vk_ray_tracing_pipeline.cpp +++ b/icd/api/raytrace/vk_ray_tracing_pipeline.cpp @@ -613,7 +613,7 @@ VkResult RayTracingPipeline::CreateImpl( const uint32_t totalGroupCount = pipelineCreateInfo.groupCount + pipelineLibGroupCount; - RayTracingPipelineBinary pipelineBinary[MaxPalDevices] = {}; + RayTracingPipelineBinary pipelineBinaries[MaxPalDevices] = {}; Vkgc::RayTracingShaderIdentifier* pShaderGroups [MaxPalDevices] = {}; BinaryData librarySummaries[MaxPalDevices] = {}; @@ -636,8 +636,8 @@ VkResult RayTracingPipeline::CreateImpl( { if (pipelineCreateInfo.groupCount > 0) { - pipelineBinary[0].shaderGroupHandle.shaderHandles = pShaderGroups[0]; - pipelineBinary[0].shaderGroupHandle.shaderHandleCount = pipelineCreateInfo.groupCount; + pipelineBinaries[0].shaderGroupHandle.shaderHandles = pShaderGroups[0]; + pipelineBinaries[0].shaderGroupHandle.shaderHandleCount = pipelineCreateInfo.groupCount; } for (uint32_t deviceIdx = 1; deviceIdx < m_pDevice->NumPalDevices(); ++deviceIdx) @@ -645,8 +645,8 @@ VkResult RayTracingPipeline::CreateImpl( pShaderGroups[deviceIdx] = pShaderGroups[deviceIdx - 1] + totalGroupCount; if (pipelineCreateInfo.groupCount > 0) { - pipelineBinary[deviceIdx].shaderGroupHandle.shaderHandles = pShaderGroups[deviceIdx]; - pipelineBinary[deviceIdx].shaderGroupHandle.shaderHandleCount = pipelineCreateInfo.groupCount; + pipelineBinaries[deviceIdx].shaderGroupHandle.shaderHandles = pShaderGroups[deviceIdx]; + pipelineBinaries[deviceIdx].shaderGroupHandle.shaderHandleCount = pipelineCreateInfo.groupCount; } } } @@ -668,11 +668,11 @@ VkResult RayTracingPipeline::CreateImpl( nullptr, utils::PlacementElement{ - &pipelineBinary[0].shaderPropSet.shaderProps, + &pipelineBinaries[0].shaderPropSet.shaderProps, maxFunctionCount * m_pDevice->NumPalDevices()}, utils::PlacementElement{ - &pipelineBinary[0].pPipelineBins, + &pipelineBinaries[0].pPipelineBins, maxPipelineBinaryCount * m_pDevice->NumPalDevices()}, utils::PlacementElement{&pIndirectFuncInfo, maxFunctionCount}, @@ -691,13 +691,13 @@ VkResult RayTracingPipeline::CreateImpl( memset(pTempBuffer, 0, placement.SizeOf()); placement.FixupPtrs(pTempBuffer); - 
pipelineBinary[0].shaderPropSet.shaderCount = maxFunctionCount; - pipelineBinary[0].pipelineBinCount = maxPipelineBinaryCount; + pipelineBinaries[0].shaderPropSet.shaderCount = maxFunctionCount; + pipelineBinaries[0].pipelineBinCount = maxPipelineBinaryCount; for (uint32_t deviceIdx = 1; deviceIdx < m_pDevice->NumPalDevices(); ++deviceIdx) { - const auto pBinary = &pipelineBinary[deviceIdx]; - const auto& prevBinary = pipelineBinary[deviceIdx - 1]; + const auto pBinary = &pipelineBinaries[deviceIdx]; + const auto& prevBinary = pipelineBinaries[deviceIdx - 1]; pBinary->pipelineBinCount = maxPipelineBinaryCount; pBinary->pPipelineBins = prevBinary.pPipelineBins + maxPipelineBinaryCount; @@ -733,7 +733,7 @@ VkResult RayTracingPipeline::CreateImpl( &cacheId[deviceIdx] ); - bool forceCompilation = m_pDevice->GetRuntimeSettings().enablePipelineDump; + bool forceCompilation = false; if (forceCompilation == false) { Vkgc::BinaryData cachedBinData = {}; @@ -761,31 +761,75 @@ VkResult RayTracingPipeline::CreateImpl( // Unpack the cached blob into separate binaries. pDefaultCompiler->ExtractRayTracingPipelineBinary( &cachedBinData, - &pipelineBinary[deviceIdx]); + &pipelineBinaries[deviceIdx]); } } - // Compile if unable to retrieve from cache. if (cacheResult != Util::Result::Success) { - result = pDefaultCompiler->ConvertRayTracingPipelineInfo( + if ((settings.ignoreFlagFailOnPipelineCompileRequired == false) && + (flags & VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR)) + { + result = VK_PIPELINE_COMPILE_REQUIRED_EXT; + } + } + + bool shouldConvert = (pCreateInfo != nullptr) && + (settings.enablePipelineDump || (cacheResult != Util::Result::Success)); + + VkResult convertResult = VK_ERROR_UNKNOWN; + if (shouldConvert) + { + convertResult = pDefaultCompiler->ConvertRayTracingPipelineInfo( m_pDevice, &pipelineCreateInfo, flags, &shaderInfo, &optimizerKey, &binaryCreateInfo); + result = (result == VK_SUCCESS) ? convertResult : result; + } - if (result == VK_SUCCESS) + if ((result == VK_SUCCESS) && + (convertResult == VK_SUCCESS) && + (cacheResult != Util::Result::Success)) + { + for (uint32_t i = 0; i < binaryCreateInfo.pipelineInfo.shaderCount; i++) { - result = pDefaultCompiler->CreateRayTracingPipelineBinary( - m_pDevice, - deviceIdx, - pPipelineCache, - &binaryCreateInfo, - &pipelineBinary[deviceIdx], - &cacheId[deviceIdx]); + if (IsShaderModuleIdentifier(binaryCreateInfo.pipelineInfo.pShaders[i])) + { + result = VK_ERROR_UNKNOWN; + break; + } + } + } + + if (settings.enablePipelineDump && (convertResult == VK_SUCCESS)) + { + if ((cacheResult == Util::Result::Success) || (result != VK_SUCCESS)) + { + Vkgc::PipelineBuildInfo pipelineInfo = {}; + pipelineInfo.pRayTracingInfo = &binaryCreateInfo.pipelineInfo; + pDefaultCompiler->DumpPipeline( + m_pDevice->GetRuntimeSettings(), + pipelineInfo, + binaryCreateInfo.apiPsoHash, + pipelineBinaries[deviceIdx].pipelineBinCount, + pipelineBinaries[deviceIdx].pPipelineBins, + result); } + } + + // Compile if unable to retrieve from cache. + if ((result == VK_SUCCESS) && (cacheResult != Util::Result::Success)) + { + result = pDefaultCompiler->CreateRayTracingPipelineBinary( + m_pDevice, + deviceIdx, + pPipelineCache, + &binaryCreateInfo, + &pipelineBinaries[deviceIdx], + &cacheId[deviceIdx]); // Add the pipeline to any cache layer where it's missing. if (result == VK_SUCCESS) @@ -794,7 +838,7 @@ VkResult RayTracingPipeline::CreateImpl( // Join the binaries into a single blob. 
pDefaultCompiler->BuildRayTracingPipelineBinary( - &pipelineBinary[deviceIdx], + &pipelineBinaries[deviceIdx], &cachedBinData); if (cachedBinData.pCode != nullptr) @@ -814,7 +858,7 @@ VkResult RayTracingPipeline::CreateImpl( if (totalGroupCount > 0) { // Copy shader groups if compiler doesn't use pre-allocated buffer. - const auto& groupHandle = pipelineBinary[deviceIdx].shaderGroupHandle; + const auto& groupHandle = pipelineBinaries[deviceIdx].shaderGroupHandle; if (groupHandle.shaderHandles != pShaderGroups[deviceIdx]) { memcpy( @@ -825,13 +869,13 @@ VkResult RayTracingPipeline::CreateImpl( } } - m_hasTraceRay = pipelineBinary[DefaultDeviceIndex].hasTraceRay; + m_hasTraceRay = pipelineBinaries[DefaultDeviceIndex].hasTraceRay; uint32_t funcCount = 0; if (result == VK_SUCCESS) { - const auto pShaderProp = &pipelineBinary[DefaultDeviceIndex].shaderPropSet.shaderProps[0]; - const uint32_t shaderCount = pipelineBinary[DefaultDeviceIndex].shaderPropSet.shaderCount; + const auto pShaderProp = &pipelineBinaries[DefaultDeviceIndex].shaderPropSet.shaderProps[0]; + const uint32_t shaderCount = pipelineBinaries[DefaultDeviceIndex].shaderPropSet.shaderCount; for (uint32_t i = 0; i < shaderCount; i++) { if (pShaderProp[i].shaderId != RayTracingInvalidShaderId) @@ -854,7 +898,7 @@ VkResult RayTracingPipeline::CreateImpl( for (uint32_t deviceIdx = 0; deviceIdx != MaxPalDevices; ++deviceIdx) { - const auto& librarySummary = pipelineBinary[deviceIdx].librarySummary; + const auto& librarySummary = pipelineBinaries[deviceIdx].librarySummary; totalLibrarySummariesSize += Pow2Align(librarySummary.codeSize, 8); } @@ -875,7 +919,7 @@ VkResult RayTracingPipeline::CreateImpl( for (uint32_t deviceIdx = 0; deviceIdx != MaxPalDevices; ++deviceIdx) { - const auto& librarySummary = pipelineBinary[deviceIdx].librarySummary; + const auto& librarySummary = pipelineBinaries[deviceIdx].librarySummary; librarySummaries[deviceIdx].pCode = VoidPtrInc(pBuffer, offset); librarySummaries[deviceIdx].codeSize = librarySummary.codeSize; memcpy(VoidPtrInc(pBuffer, offset), librarySummary.pCode, librarySummary.codeSize); @@ -969,7 +1013,7 @@ VkResult RayTracingPipeline::CreateImpl( ((deviceIdx < m_pDevice->NumPalDevices()) && (palResult == Pal::Result::Success)); deviceIdx++) { - const auto pBinaries = pipelineBinary[deviceIdx].pPipelineBins; + const auto pBinaries = pipelineBinaries[deviceIdx].pPipelineBins; const auto ppDeviceShaderLibraries = ppShaderLibraries + deviceIdx * funcCount; void* pDeviceShaderLibraryMem = Util::VoidPtrInc(pPalShaderLibraryMem, deviceIdx * funcCount * shaderLibrarySize); @@ -984,14 +1028,14 @@ VkResult RayTracingPipeline::CreateImpl( localPipelineInfo.pipeline.flags.clientInternal = false; localPipelineInfo.pipeline.pipelineBinarySize = pBinaries[0].codeSize; localPipelineInfo.pipeline.pPipelineBinary = pBinaries[0].pCode; - localPipelineInfo.pipeline.maxFunctionCallDepth = pipelineBinary[deviceIdx].maxFunctionCallDepth; + localPipelineInfo.pipeline.maxFunctionCallDepth = pipelineBinaries[deviceIdx].maxFunctionCallDepth; } // Copy indirect function info uint32_t funcIndex = 0; - const auto pShaderProp = &pipelineBinary[deviceIdx].shaderPropSet.shaderProps[0]; - const uint32_t traceRayShaderIndex = pipelineBinary[deviceIdx].shaderPropSet.traceRayIndex; - const uint32_t shaderCount = pipelineBinary[deviceIdx].shaderPropSet.shaderCount; + const auto pShaderProp = &pipelineBinaries[deviceIdx].shaderPropSet.shaderProps[0]; + const uint32_t traceRayShaderIndex = 
pipelineBinaries[deviceIdx].shaderPropSet.traceRayIndex; + const uint32_t shaderCount = pipelineBinaries[deviceIdx].shaderPropSet.shaderCount; for (uint32_t i = 0; i < shaderCount; i++) { @@ -1251,10 +1295,10 @@ VkResult RayTracingPipeline::CreateImpl( const auto pPipelineLibShaderGroups = pPipelineLib->GetShaderGroupHandles(deviceIdx); const auto pLibGroupInfos = pPipelineLib->GetShaderGroupInfos(); - // update pipelineLibHasTraceRay and pipelineLibTraceRayVa - pipelineHasTraceRay = pPipelineLib->CheckHasTraceRay(); - if (pipelineHasTraceRay) + // update pipelineHasTraceRay and pipelineLibTraceRayVa + if (pPipelineLib->CheckHasTraceRay()) { + pipelineHasTraceRay = true; pipelineLibTraceRayVa = pPipelineLib->GetTraceRayGpuVa(deviceIdx); } @@ -1370,7 +1414,9 @@ VkResult RayTracingPipeline::CreateImpl( if (funcCount > 0) { const auto traceRayFuncIndex = funcCount - 1; - traceRayGpuVas[deviceIdx] = pIndirectFuncInfo[traceRayFuncIndex].gpuVirtAddr; + traceRayGpuVas[deviceIdx] = + pIndirectFuncInfo[traceRayFuncIndex].gpuVirtAddr | + pShaderProp[traceRayFuncIndex].shaderIdExtraBits; } else if (pipelineHasTraceRay) { @@ -1453,12 +1499,12 @@ VkResult RayTracingPipeline::CreateImpl( if (settings.enableDebugPrintf) { ClearFormatString(); - for (uint32_t i = 0; i < pipelineBinary[DefaultDeviceIndex].pipelineBinCount; ++i) + for (uint32_t i = 0; i < pipelineBinaries[DefaultDeviceIndex].pipelineBinCount; ++i) { DebugPrintf::DecodeFormatStringsFromElf( m_pDevice, - pipelineBinary[DefaultDeviceIndex].pPipelineBins[i].codeSize, - static_cast(pipelineBinary[DefaultDeviceIndex].pPipelineBins[i].pCode), + pipelineBinaries[DefaultDeviceIndex].pPipelineBins[i].codeSize, + static_cast(pipelineBinaries[DefaultDeviceIndex].pPipelineBins[i].pCode), GetFormatStrings()); } } @@ -1502,7 +1548,7 @@ VkResult RayTracingPipeline::CreateImpl( { m_pDevice->GetCompiler(deviceIdx)->FreeRayTracingPipelineBinary( &binaryCreateInfo, - &pipelineBinary[deviceIdx]); + &pipelineBinaries[deviceIdx]); } pAllocator->pfnFree(pAllocator->pUserData, pTempBuffer); @@ -1552,70 +1598,71 @@ static int32_t DeferredCreateRayTracingPipelineCallback( { case DeferredCallbackType::Join: { - uint32_t index = Util::AtomicIncrement(&pState->nextPending) - 1; - - const bool firstThread = (index == 0); - - // Run in a loop until we've processed all pipeline create infos. Parallel joins in their own loops can - consume iterations. A single "main" thread per pipeline is sent out here. These threads will not return - untill the pipeline has been fully created (unlike the helper worker threads). - while (index < pState->infoCount) + if (pState->nextPending < pState->infoCount) { - VkResult localResult = VK_SUCCESS; - const VkRayTracingPipelineCreateInfoKHR* pCreateInfo = &pState->pInfos[index]; - VkPipelineCreateFlags2KHR flags = - Device::GetPipelineCreateFlags(pCreateInfo); + uint32_t index = Util::AtomicIncrement(&pState->nextPending) - 1; - if (pState->skipRemaining == VK_FALSE) + // Run in a loop until we've processed all pipeline create infos. Parallel joins in their own loops can + // consume iterations. A single "main" thread per pipeline is sent out here. These threads will not return + // until the pipeline has been fully created (unlike the helper worker threads).
+ while (index < pState->infoCount) { - RayTracingPipeline* pPipeline = RayTracingPipeline::ObjectFromHandle(pState->pPipelines[index]); - - localResult = pPipeline->CreateImpl(pState->pPipelineCache, - pCreateInfo, - flags, - pState->pAllocator, - pOperation->Workload(index)); + VkResult localResult = VK_SUCCESS; + const VkRayTracingPipelineCreateInfoKHR* pCreateInfo = &pState->pInfos[index]; + VkPipelineCreateFlags2KHR flags = + Device::GetPipelineCreateFlags(pCreateInfo); -#if ICD_GPUOPEN_DEVMODE_BUILD - if (localResult == VK_SUCCESS) + if (pState->skipRemaining == VK_FALSE) { - DevModeMgr* pDevMgr = pDevice->VkInstance()->GetDevModeMgr(); + RayTracingPipeline* pPipeline = RayTracingPipeline::ObjectFromHandle(pState->pPipelines[index]); + + localResult = pPipeline->CreateImpl(pState->pPipelineCache, + pCreateInfo, + flags, + pState->pAllocator, + pOperation->Workload(index)); - if (pDevMgr != nullptr) +#if ICD_GPUOPEN_DEVMODE_BUILD + if (localResult == VK_SUCCESS) { - pDevMgr->PipelineCreated(pDevice, pPipeline); + IDevMode* pDevMode = pDevice->VkInstance()->GetDevModeMgr(); - if (pPipeline->IsInlinedShaderEnabled() == false) + if (pDevMode != nullptr) { - pDevMgr->ShaderLibrariesCreated(pDevice, pPipeline); + pDevMode->PipelineCreated(pDevice, pPipeline); + + if (pPipeline->IsInlinedShaderEnabled() == false) + { + pDevMode->ShaderLibrariesCreated(pDevice, pPipeline); + } } } - } #endif - } - - if (localResult != VK_SUCCESS) - { - Util::AtomicCompareAndSwap(&pState->finalResult, - static_cast(VK_SUCCESS), - static_cast(localResult)); + } - if (pCreateInfo->flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT) + if (localResult != VK_SUCCESS) { - Util::AtomicCompareAndSwap(&pState->skipRemaining, - VK_FALSE, - VK_TRUE); + Util::AtomicCompareAndSwap(&pState->finalResult, + static_cast(VK_SUCCESS), + static_cast(localResult)); + + if (pCreateInfo->flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT) + { + Util::AtomicCompareAndSwap(&pState->skipRemaining, + VK_FALSE, + VK_TRUE); + } } - } - // If the workloads for this pipeline are still pending (after creation), then no-op them at this point - Util::AtomicCompareAndSwap(&pOperation->Workload(index)->totalInstances, - UINT_MAX, - 0); + // If the workloads for this pipeline are still pending (after creation), then no-op them at this point + Util::AtomicCompareAndSwap(&pOperation->Workload(index)->totalInstances, + UINT_MAX, + 0); - Util::AtomicIncrement(&pState->completed); + Util::AtomicIncrement(&pState->completed); - index = Util::AtomicIncrement(&pState->nextPending) - 1; + index = Util::AtomicIncrement(&pState->nextPending) - 1; + } } // Helper worker threads go through here. They assist the main pipeline threads. 
Currently, the only workloads diff --git a/icd/api/renderpass/renderpass_builder.cpp b/icd/api/renderpass/renderpass_builder.cpp index dec85cf6..8d23e42a 100644 --- a/icd/api/renderpass/renderpass_builder.cpp +++ b/icd/api/renderpass/renderpass_builder.cpp @@ -921,7 +921,9 @@ static void IncludeWaitPoint( } // ===================================================================================================================== -static void ConvertImplicitSyncs(RPBarrierInfo* pBarrier) +static void ConvertImplicitSyncsLegacy( + RPBarrierInfo* pBarrier, + const RuntimeSettings& settings) { pBarrier->implicitSrcCacheMask = 0; pBarrier->implicitDstCacheMask = 0; @@ -934,9 +936,6 @@ static void ConvertImplicitSyncs(RPBarrierInfo* pBarrier) IncludePipePoint(pBarrier, Pal::HwPipeBottom); IncludeWaitPoint(pBarrier, Pal::HwPipePreBlt); - pBarrier->srcStageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR; - pBarrier->dstStageMask |= VK_PIPELINE_STAGE_2_RESOLVE_BIT_KHR; - pBarrier->implicitSrcCacheMask |= pBarrier->flags.preColorResolveSync ? Pal::CoherColorTarget : Pal::CoherDepthStencilTarget; @@ -950,8 +949,6 @@ static void ConvertImplicitSyncs(RPBarrierInfo* pBarrier) { IncludeWaitPoint(pBarrier, Pal::HwPipePreBlt); - pBarrier->dstStageMask |= VK_PIPELINE_STAGE_2_CLEAR_BIT_KHR; - pBarrier->implicitDstCacheMask |= Pal::CoherClear; } @@ -961,7 +958,57 @@ static void ConvertImplicitSyncs(RPBarrierInfo* pBarrier) IncludePipePoint(pBarrier, Pal::HwPipePostBlt); IncludeWaitPoint(pBarrier, Pal::HwPipeTop); - // Just going by the above wait point, the dstStageMask would be converted to TopOfPipe, but it is not optimal. + pBarrier->implicitSrcCacheMask |= Pal::CoherResolveSrc; + } + + if (pBarrier->flags.implicitExternalOutgoing && + (pBarrier->pipePointCount < (MaxHwPipePoints - 1)) && + settings.implicitExternalSynchronization) + { + // Since there is no handling of implicitExternalIncoming today, make this visible immediately. + IncludeWaitPoint(pBarrier, Pal::HwPipeTop); + + pBarrier->pipePoints[pBarrier->pipePointCount] = Pal::HwPipeBottom; + pBarrier->pipePointCount++; + + pBarrier->srcAccessMask |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + } +} + +// ===================================================================================================================== +static void ConvertImplicitSyncs( + RPBarrierInfo* pBarrier, + const RuntimeSettings& settings) +{ + pBarrier->implicitSrcCacheMask = 0; + pBarrier->implicitDstCacheMask = 0; + + // Augment the waiting if we need to wait for prior color rendering to finish + if (pBarrier->flags.preColorResolveSync || + pBarrier->flags.preDsResolveSync) + { + pBarrier->srcStageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR; + pBarrier->dstStageMask |= VK_PIPELINE_STAGE_2_RESOLVE_BIT_KHR; + + pBarrier->implicitSrcCacheMask |= pBarrier->flags.preColorResolveSync ? Pal::CoherColorTarget : + Pal::CoherDepthStencilTarget; + pBarrier->implicitDstCacheMask |= Pal::CoherResolveDst; + } + + // Wait for (non-auto-synced) pre-clear if necessary. No need to augment the pipe point because the prior work falls + // under subpass dependency, but we may need to move the wait point forward to cover blts.
+ if (pBarrier->flags.preColorClearSync || + pBarrier->flags.preDsClearSync) + { + pBarrier->dstStageMask |= VK_PIPELINE_STAGE_2_CLEAR_BIT_KHR; + + pBarrier->implicitDstCacheMask |= Pal::CoherClear; + } + + // Augment the active source pipeline stages for resolves if we need to wait for prior resolves to complete + if (pBarrier->flags.postResolveSync) + { // TopOfPipe causes a stall at PFP which is not really needed for images. As an optimization for Acq-Rel // barriers we instead set dstStage to Blt here. pBarrier->srcStageMask |= VK_PIPELINE_STAGE_2_RESOLVE_BIT_KHR; @@ -969,6 +1016,17 @@ static void ConvertImplicitSyncs(RPBarrierInfo* pBarrier) pBarrier->implicitSrcCacheMask |= Pal::CoherResolveSrc; } + + if (pBarrier->flags.implicitExternalOutgoing && + (pBarrier->pipePointCount < (MaxHwPipePoints - 1)) && + settings.implicitExternalSynchronization) + { + pBarrier->srcStageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR; + pBarrier->dstStageMask |= VK_PIPELINE_STAGE_2_BLIT_BIT_KHR; + + pBarrier->srcAccessMask |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + } } // ===================================================================================================================== @@ -977,35 +1035,14 @@ static void ConvertImplicitSyncs(RPBarrierInfo* pBarrier) void RenderPassBuilder::PostProcessSyncPoint( SyncPointState* pSyncPoint) { - // Convert subpass dependency execution scope to PAL pipe/wait point - pSyncPoint->barrier.waitPoint = VkToPalWaitPipePoint(pSyncPoint->barrier.dstStageMask); - - pSyncPoint->barrier.pipePointCount = VkToPalSrcPipePoints(pSyncPoint->barrier.srcStageMask, - pSyncPoint->barrier.pipePoints); - - // Include implicit waiting and cache access - ConvertImplicitSyncs(&pSyncPoint->barrier); - - if (pSyncPoint->barrier.flags.implicitExternalOutgoing && - (pSyncPoint->barrier.pipePointCount < (MaxHwPipePoints - 1)) && - m_pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetRuntimeSettings().implicitExternalSynchronization) - { - // Since there is no handling of implicitExternalIncoming today, make this visible immediately. - IncludeWaitPoint(&pSyncPoint->barrier, Pal::HwPipeTop); - - pSyncPoint->barrier.pipePoints[pSyncPoint->barrier.pipePointCount] = Pal::HwPipeBottom; - pSyncPoint->barrier.pipePointCount++; - - pSyncPoint->barrier.srcStageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR; - pSyncPoint->barrier.dstStageMask |= VK_PIPELINE_STAGE_2_BLIT_BIT_KHR; - - pSyncPoint->barrier.srcAccessMask |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; - } + const RuntimeSettings& settings = m_pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetRuntimeSettings(); if (m_pDevice->GetPalProperties().gfxipProperties.flags.supportReleaseAcquireInterface && - m_pDevice->GetRuntimeSettings().useAcquireReleaseInterface) + settings.useAcquireReleaseInterface) { + // Include implicit waiting and cache access + ConvertImplicitSyncs(&pSyncPoint->barrier, settings); + // Need a global cache transition if any of the sync flags are set or if there's an app // subpass dependency that requires cache synchronization. 
if (((pSyncPoint->barrier.srcAccessMask != 0) || @@ -1074,6 +1111,15 @@ void RenderPassBuilder::PostProcessSyncPoint( } else { + // Convert subpass dependency execution scope to PAL pipe/wait point + pSyncPoint->barrier.waitPoint = VkToPalWaitPipePoint(pSyncPoint->barrier.dstStageMask); + + pSyncPoint->barrier.pipePointCount = VkToPalSrcPipePoints(pSyncPoint->barrier.srcStageMask, + pSyncPoint->barrier.pipePoints); + + // Include implicit waiting and cache access + ConvertImplicitSyncsLegacy(&pSyncPoint->barrier, settings); + // Need a global cache transition if any of the sync flags are set or if there's an app // subpass dependency that requires cache synchronization. if ((pSyncPoint->barrier.srcAccessMask != 0) || diff --git a/icd/api/renderpass/renderpass_types.h b/icd/api/renderpass/renderpass_types.h index a485e618..6c7d4a92 100644 --- a/icd/api/renderpass/renderpass_types.h +++ b/icd/api/renderpass/renderpass_types.h @@ -109,8 +109,10 @@ struct RPBindTargetsInfo struct RPBarrierInfo { // The following fields are a composite of all VkSubpassDependencies that affect this particular barrier: - PipelineStageFlags srcStageMask; - PipelineStageFlags dstStageMask; + PipelineStageFlags srcStageMask; // VK-srcStageMask. This will be converted to the appropriate PAL + // source stage mask when passing barrier info to PAL + PipelineStageFlags dstStageMask; // VK-dstStageMask. This will be converted to the appropriate PAL + // dst stage mask when passing barrier info to PAL AccessFlags srcAccessMask; AccessFlags dstAccessMask; Pal::HwPipePoint waitPoint; diff --git a/icd/api/sqtt/sqtt_layer.cpp b/icd/api/sqtt/sqtt_layer.cpp index 347ef2a3..638ab2f4 100644 --- a/icd/api/sqtt/sqtt_layer.cpp +++ b/icd/api/sqtt/sqtt_layer.cpp @@ -273,14 +273,14 @@ SqttCmdBufferState::SqttCmdBufferState( : m_pCmdBuf(pCmdBuf), m_pSqttMgr(pCmdBuf->VkDevice()->GetSqttMgr()), - m_pDevModeMgr(pCmdBuf->VkDevice()->VkInstance()->GetDevModeMgr()), + m_pDevMode(pCmdBuf->VkDevice()->VkInstance()->GetDevModeMgr()), m_settings(pCmdBuf->VkDevice()->GetRuntimeSettings()), m_pNextLayer(m_pSqttMgr->GetNextLayer()), m_currentEntryPoint(RgpSqttMarkerGeneralApiType::Invalid), m_currentEventId(0), m_currentEventType(RgpSqttMarkerEventType::InternalUnknown), #if ICD_GPUOPEN_DEVMODE_BUILD - m_instructionTrace({ false, DevModeMgr::InvalidTargetPipelineHash, VK_PIPELINE_BIND_POINT_MAX_ENUM }), + m_instructionTrace({ false, IDevMode::InvalidTargetPipelineHash, VK_PIPELINE_BIND_POINT_MAX_ENUM }), #endif m_debugTags(pCmdBuf->VkInstance()->Allocator()) { @@ -319,9 +319,9 @@ void SqttCmdBufferState::Begin( m_currentEventId = 0; #if ICD_GPUOPEN_DEVMODE_BUILD - if (m_pDevModeMgr != nullptr) + if (m_pDevMode != nullptr) { - m_instructionTrace.targetHash = m_pDevModeMgr->GetInstructionTraceTargetHash(); + m_instructionTrace.targetHash = m_pDevMode->GetInstructionTraceTargetHash(); } #endif @@ -376,10 +376,10 @@ void SqttCmdBufferState::End() WriteCbEndMarker(); #if ICD_GPUOPEN_DEVMODE_BUILD - if ((m_pDevModeMgr != nullptr) && + if ((m_pDevMode != nullptr) && (m_instructionTrace.started)) { - m_pDevModeMgr->StopInstructionTrace(m_pCmdBuf); + m_pDevMode->StopInstructionTrace(m_pCmdBuf); m_instructionTrace.started = false; } #endif @@ -550,7 +550,7 @@ void SqttCmdBufferState::WriteUserEventMarker( // ==================================================================================================================== void SqttCmdBufferState::RgdAnnotateCmdBuf() { - if (m_pDevModeMgr->IsCrashAnalysisEnabled()) + if (m_pDevMode->IsCrashAnalysisEnabled()) {
Pal::RgdMarkerInfoCmdBufData info = {}; info.header.infoType = Pal::RgdMarkerInfoTypeCmdBufStart; @@ -573,7 +573,7 @@ { // CrashAnalysis already inserts markers for all dispatches on the PAL side. Here, we just provide additional context for // the described dispatch. - if (m_pDevModeMgr->IsCrashAnalysisEnabled()) + if (m_pDevMode->IsCrashAnalysisEnabled()) { if ((type == RgpSqttMarkerEventType::CmdDispatch) || (type == RgpSqttMarkerEventType::CmdDispatchIndirect) @@ -608,7 +608,7 @@ { // CrashAnalysis already inserts markers for all draws that come from the application on the PAL side. Here, we just provide // additional context for the described draw. - if (m_pDevModeMgr->IsCrashAnalysisEnabled()) + if (m_pDevMode->IsCrashAnalysisEnabled()) { if ((type == RgpSqttMarkerEventType::CmdDraw) || (type == RgpSqttMarkerEventType::CmdDrawIndexed)) { @@ -634,7 +634,7 @@ Pal::Developer::BarrierType type, // Barrier type uint32 reason) // Reason for the barrier { - if (m_pDevModeMgr->IsCrashAnalysisEnabled() && + if (m_pDevMode->IsCrashAnalysisEnabled() && (m_currentEventType == RgpSqttMarkerEventType::CmdPipelineBarrier)) { Pal::RgdMarkerInfoBarrierBeginData info = {}; @@ -654,7 +654,7 @@ Pal::Developer::BarrierOperations operations) // What the barrier does { // CrashAnalysisCmdBuffer does not insert markers for barriers. We insert one as MarkerSource::Pal here. - if (m_pDevModeMgr->IsCrashAnalysisEnabled() && + if (m_pDevMode->IsCrashAnalysisEnabled() && (m_currentEventType == RgpSqttMarkerEventType::CmdPipelineBarrier)) { Pal::RgdMarkerInfoBarrierEndData info = {}; @@ -1049,12 +1049,12 @@ void SqttCmdBufferState::PipelineBound( const Pipeline* pPipeline = Pipeline::BaseObjectFromHandle(pipeline); #if ICD_GPUOPEN_DEVMODE_BUILD - if (m_pDevModeMgr != nullptr) + if (m_pDevMode != nullptr) { if ((m_instructionTrace.started == false) && (pPipeline->GetApiHash() == m_instructionTrace.targetHash)) { - m_pDevModeMgr->StartInstructionTrace(m_pCmdBuf); + m_pDevMode->StartInstructionTrace(m_pCmdBuf); m_instructionTrace.bindPoint = bindPoint; m_instructionTrace.started = true; } @@ -2231,14 +2231,14 @@ VKAPI_ATTR VkResult VKAPI_CALL vkCreateGraphicsPipelines( { Device* pDevice = ApiDevice::ObjectFromHandle(device); SqttMgr* pSqtt = pDevice->GetSqttMgr(); - DevModeMgr* pDevMgr = pDevice->VkInstance()->GetDevModeMgr(); + IDevMode* pDevMode = pDevice->VkInstance()->GetDevModeMgr(); VkResult result = SQTT_CALL_NEXT_LAYER(vkCreateGraphicsPipelines)(device, pipelineCache, createInfoCount, pCreateInfos, pAllocator, pPipelines); if (pDevice->GetRuntimeSettings().devModeShaderIsaDbEnable && (result == VK_SUCCESS) && - (pDevMgr != nullptr)) + (pDevMode != nullptr)) { for (uint32_t i = 0; i < createInfoCount; ++i) { @@ -2263,7 +2263,7 @@ VKAPI_ATTR VkResult VKAPI_CALL vkCreateGraphicsPipelines( } #if ICD_GPUOPEN_DEVMODE_BUILD - pDevMgr->PipelineCreated(pDevice, pPipeline); + pDevMode->PipelineCreated(pDevice, pPipeline); #endif } } @@ -2283,14 +2283,14 @@ VKAPI_ATTR VkResult VKAPI_CALL vkCreateComputePipelines( { Device* pDevice = ApiDevice::ObjectFromHandle(device); SqttMgr* pSqtt = pDevice->GetSqttMgr(); - DevModeMgr* pDevMgr = pDevice->VkInstance()->GetDevModeMgr(); + IDevMode* pDevMode = pDevice->VkInstance()->GetDevModeMgr(); VkResult result = SQTT_CALL_NEXT_LAYER(vkCreateComputePipelines)(device, pipelineCache, createInfoCount, pCreateInfos,
pAllocator, pPipelines); if (pDevice->GetRuntimeSettings().devModeShaderIsaDbEnable && (result == VK_SUCCESS) && - (pDevMgr != nullptr)) + (pDevMode != nullptr)) { for (uint32_t i = 0; i < createInfoCount; ++i) { @@ -2311,7 +2311,7 @@ VKAPI_ATTR VkResult VKAPI_CALL vkCreateComputePipelines( } #if ICD_GPUOPEN_DEVMODE_BUILD - pDevMgr->PipelineCreated(pDevice, pPipeline); + pDevMode->PipelineCreated(pDevice, pPipeline); #endif } } @@ -2333,14 +2333,14 @@ VKAPI_ATTR VkResult VKAPI_CALL vkCreateRayTracingPipelinesKHR( { Device* pDevice = ApiDevice::ObjectFromHandle(device); SqttMgr* pSqtt = pDevice->GetSqttMgr(); - DevModeMgr* pDevMgr = pDevice->VkInstance()->GetDevModeMgr(); + IDevMode* pDevMode = pDevice->VkInstance()->GetDevModeMgr(); VkResult result = SQTT_CALL_NEXT_LAYER(vkCreateRayTracingPipelinesKHR)(device, deferredOperation, pipelineCache, createInfoCount, pCreateInfos, pAllocator, pPipelines); if (pDevice->GetRuntimeSettings().devModeShaderIsaDbEnable && ((result == VK_SUCCESS) || (result == VK_OPERATION_DEFERRED_KHR)) && - (pDevMgr != nullptr)) + (pDevMode != nullptr)) { for (uint32_t i = 0; i < createInfoCount; ++i) { @@ -2367,11 +2367,11 @@ VKAPI_ATTR VkResult VKAPI_CALL vkCreateRayTracingPipelinesKHR( #if ICD_GPUOPEN_DEVMODE_BUILD if (result != VK_OPERATION_DEFERRED_KHR) { - pDevMgr->PipelineCreated(pDevice, pPipeline); + pDevMode->PipelineCreated(pDevice, pPipeline); if (pPipeline->IsInlinedShaderEnabled() == false) { - pDevMgr->ShaderLibrariesCreated(pDevice, pPipeline); + pDevMode->ShaderLibrariesCreated(pDevice, pPipeline); } } #endif @@ -2534,16 +2534,16 @@ VKAPI_ATTR void VKAPI_CALL vkDestroyPipeline( { Device* pDevice = ApiDevice::ObjectFromHandle(device); SqttMgr* pSqtt = pDevice->GetSqttMgr(); - DevModeMgr* pDevMgr = pDevice->VkInstance()->GetDevModeMgr(); + IDevMode* pDevMode = pDevice->VkInstance()->GetDevModeMgr(); #if ICD_GPUOPEN_DEVMODE_BUILD - if (pDevice->GetRuntimeSettings().devModeShaderIsaDbEnable && (pDevMgr != nullptr)) + if (pDevice->GetRuntimeSettings().devModeShaderIsaDbEnable && (pDevMode != nullptr)) { if (VK_NULL_HANDLE != pipeline) { Pipeline* pPipeline = Pipeline::BaseObjectFromHandle(pipeline); - pDevMgr->PipelineDestroyed(pDevice, pPipeline); + pDevMode->PipelineDestroyed(pDevice, pPipeline); #if VKI_RAY_TRACING if (pPipeline->GetType() == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR) @@ -2552,7 +2552,7 @@ VKAPI_ATTR void VKAPI_CALL vkDestroyPipeline( if (pRtPipeline->IsInlinedShaderEnabled() == false) { - pDevMgr->ShaderLibrariesDestroyed(pDevice, pRtPipeline); + pDevMode->ShaderLibrariesDestroyed(pDevice, pRtPipeline); } } #endif @@ -2710,7 +2710,7 @@ VKAPI_ATTR VkResult VKAPI_CALL vkSetDebugUtilsObjectTagEXT( // calls but still want to start/stop RGP tracing. static void CheckRGPFrameBegin( Queue* pQueue, - DevModeMgr* pDevMode, + IDevMode* pDevMode, uint32_t submitCount, const VkSubmitInfo* pSubmits) { @@ -2732,7 +2732,7 @@ static void CheckRGPFrameBegin( if (pCmdBuf->HasDebugTag(frameBeginTag)) { - pDevMode->NotifyFrameBegin(pQueue, DevModeMgr::FrameDelimiterType::CmdBufferTag); + pDevMode->NotifyFrameBegin(pQueue, IDevMode::FrameDelimiterType::CmdBufferTag); return; } @@ -2745,7 +2745,7 @@ static void CheckRGPFrameBegin( // Looks for markers in a submitted command buffer to identify a forced end to an RGP trace. See CheckRGPFrameBegin(). 
static void CheckRGPFrameEnd( Queue* pQueue, - DevModeMgr* pDevMode, + IDevMode* pDevMode, uint32_t submitCount, const VkSubmitInfo* pSubmits) { @@ -2767,7 +2767,7 @@ static void CheckRGPFrameEnd( if (pCmdBuf->HasDebugTag(frameEndTag)) { - pDevMode->NotifyFrameEnd(pQueue, DevModeMgr::FrameDelimiterType::CmdBufferTag); + pDevMode->NotifyFrameEnd(pQueue, IDevMode::FrameDelimiterType::CmdBufferTag); return; } @@ -2786,7 +2786,7 @@ VKAPI_ATTR VkResult VKAPI_CALL vkQueueSubmit( { Queue* pQueue = ApiQueue::ObjectFromHandle(queue); SqttMgr* pSqtt = pQueue->VkDevice()->GetSqttMgr(); - DevModeMgr* pDevMode = pQueue->VkDevice()->VkInstance()->GetDevModeMgr(); + IDevMode* pDevMode = pQueue->VkDevice()->VkInstance()->GetDevModeMgr(); #if ICD_GPUOPEN_DEVMODE_BUILD pDevMode->NotifyPreSubmit(); diff --git a/icd/api/sqtt/sqtt_layer.h b/icd/api/sqtt/sqtt_layer.h index da9bbdc8..9d93e2f0 100644 --- a/icd/api/sqtt/sqtt_layer.h +++ b/icd/api/sqtt/sqtt_layer.h @@ -56,7 +56,7 @@ class ImageView; class RenderPass; class SqttMgr; class Pipeline; -class DevModeMgr; +class IDevMode; // Contains parameters that are happening when renderpass targets are bound in the driver. struct SqttBindTargetParams @@ -207,7 +207,7 @@ class SqttCmdBufferState CmdBuffer* m_pCmdBuf; SqttMgr* m_pSqttMgr; // Per-device SQTT state - DevModeMgr* m_pDevModeMgr; + IDevMode* m_pDevMode; const RuntimeSettings& m_settings; const DispatchTable* m_pNextLayer; // Pointer to next layer's dispatch table RgpSqttMarkerCbID m_cbId; // Command buffer ID associated with this command buffer diff --git a/icd/api/strings/entry_points.txt b/icd/api/strings/entry_points.txt index b3a9763a..483270db 100644 --- a/icd/api/strings/entry_points.txt +++ b/icd/api/strings/entry_points.txt @@ -531,6 +531,17 @@ vkGetDeviceFaultInfoEXT @device @dext(EXT_devi vkGetShaderModuleIdentifierEXT @device @dext(EXT_shader_module_identifier) vkGetShaderModuleCreateInfoIdentifierEXT @device @dext(EXT_shader_module_identifier) +vkCreateIndirectCommandsLayoutNV @device @dext(NV_device_generated_commands) +vkDestroyIndirectCommandsLayoutNV @device @dext(NV_device_generated_commands) +vkGetGeneratedCommandsMemoryRequirementsNV @device @dext(NV_device_generated_commands) +vkCmdPreprocessGeneratedCommandsNV @device @dext(NV_device_generated_commands) +vkCmdExecuteGeneratedCommandsNV @device @dext(NV_device_generated_commands) +vkCmdBindPipelineShaderGroupNV @device @dext(NV_device_generated_commands) + +vkCmdUpdatePipelineIndirectBufferNV @device @dext(NV_device_generated_commands_compute) +vkGetPipelineIndirectDeviceAddressNV @device @dext(NV_device_generated_commands_compute) +vkGetPipelineIndirectMemoryRequirementsNV @device @dext(NV_device_generated_commands_compute) + vkCmdSetTessellationDomainOriginEXT @device @dext(EXT_extended_dynamic_state3) vkCmdSetDepthClampEnableEXT @device @dext(EXT_extended_dynamic_state3) vkCmdSetPolygonModeEXT @device @dext(EXT_extended_dynamic_state3) diff --git a/icd/api/strings/extensions.txt b/icd/api/strings/extensions.txt index 0642bf17..86ba3001 100644 --- a/icd/api/strings/extensions.txt +++ b/icd/api/strings/extensions.txt @@ -207,10 +207,13 @@ VK_EXT_attachment_feedback_loop_layout VK_EXT_physical_device_drm VK_KHR_cooperative_matrix VK_EXT_texture_compression_astc_hdr +VK_NV_device_generated_commands +VK_NV_device_generated_commands_compute VK_EXT_image_drm_format_modifier VK_KHR_shader_expect_assume VK_KHR_shader_subgroup_rotate VK_KHR_shader_quad_control +VK_EXT_nested_command_buffer VK_KHR_dynamic_rendering_local_read 
VK_KHR_vertex_attribute_divisor VK_EXT_frame_boundary diff --git a/icd/api/strings/generate_strings.py b/icd/api/strings/generate_strings.py index 9fbe54a6..0eea3b1e 100644 --- a/icd/api/strings/generate_strings.py +++ b/icd/api/strings/generate_strings.py @@ -149,7 +149,7 @@ def generate_entry_point_condition(f, name, cond): def get_compile_condition(cond): """Assemble condition macro name""" cond = cond.replace('@none', '') - cond = cond.replace('@win32', '_WIN32') + cond = cond.replace('@win32', 'defined(_WIN32)') core = re.compile(r'@core(?:_build_only)?\( ( [^\.]* ) \. ( [^\)]* ) \)', re.VERBOSE) cond = core.sub(r'VK_VERSION_\1_\2', cond) diff --git a/icd/api/vk_cmdbuffer.cpp b/icd/api/vk_cmdbuffer.cpp index 10649402..8170a40f 100644 --- a/icd/api/vk_cmdbuffer.cpp +++ b/icd/api/vk_cmdbuffer.cpp @@ -43,6 +43,7 @@ #include "include/vk_utils.h" #include "include/vk_query.h" #include "include/vk_queue.h" +#include "include/vk_indirect_commands_layout.h" #if VKI_RAY_TRACING #include "raytrace/vk_acceleration_structure.h" @@ -599,6 +600,16 @@ CmdBuffer::CmdBuffer( m_reverseThreadGroupState(false) #if VKI_RAY_TRACING , m_scratchVidMemList(pDevice->VkInstance()->Allocator()) + , m_maxCpsMemSize(0) + , m_patchCpsList + { + pDevice->VkInstance()->Allocator(), +#if VKI_BUILD_MAX_NUM_GPUS > 1 + pDevice->VkInstance()->Allocator(), + pDevice->VkInstance()->Allocator(), + pDevice->VkInstance()->Allocator() +#endif + } #endif { m_flags.wasBegun = false; @@ -1310,6 +1321,8 @@ VkResult CmdBuffer::Begin( #if VKI_RAY_TRACING FreeRayTracingScratchVidMemory(); + + m_maxCpsMemSize = 0; #endif const PhysicalDevice* pPhysicalDevice = m_pDevice->VkPhysicalDevice(DefaultDeviceIndex); @@ -1854,6 +1867,7 @@ VkResult CmdBuffer::Reset(VkCommandBufferResetFlags flags) #if VKI_RAY_TRACING FreeRayTracingScratchVidMemory(); + FreePatchCpsList(); #endif result = PalToVkResult(PalCmdBufferReset(releaseResources)); @@ -2337,6 +2351,7 @@ VkResult CmdBuffer::Destroy(void) #if VKI_RAY_TRACING FreeRayTracingScratchVidMemory(); + FreePatchCpsList(); #endif @@ -2956,6 +2971,8 @@ void CmdBuffer::BindVertexBuffersUpdateBindingRange( const VkBuffer buffer = pBuffers[inputIdx]; const VkDeviceSize offset = pOffsets[inputIdx]; + bool padVertexBuffers = m_flags.padVertexBuffers; + if (buffer != VK_NULL_HANDLE) { const Buffer* pBuffer = Buffer::ObjectFromHandle(buffer); @@ -2964,6 +2981,12 @@ void CmdBuffer::BindVertexBuffersUpdateBindingRange( if ((pSizes != nullptr) && (pSizes[inputIdx] != VK_WHOLE_SIZE)) { pBinding->range = pSizes[inputIdx]; + + if (offset != 0) + { + padVertexBuffers = true; + } + } else { @@ -2981,7 +3004,7 @@ void CmdBuffer::BindVertexBuffersUpdateBindingRange( pBinding->stride = pStrides[inputIdx]; } - if (m_flags.padVertexBuffers && (pBinding->stride != 0)) + if (padVertexBuffers && (pBinding->stride != 0)) { pBinding->range = Util::RoundUpToMultiple(pBinding->range, pBinding->stride); } @@ -3112,7 +3135,7 @@ void CmdBuffer::Draw( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0, 0); #endif { @@ -3139,7 +3162,7 @@ void CmdBuffer::DrawIndexed( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 
0, nullptr, 0, 0); #endif { @@ -3155,7 +3178,7 @@ } // ===================================================================================================================== -template< bool indexed, bool useBufferCount> +template<bool indexed, bool useBufferCount> void CmdBuffer::DrawIndirect( VkBuffer buffer, VkDeviceSize offset, @@ -3169,7 +3192,7 @@ void CmdBuffer::DrawIndirect( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0, 0); #endif Buffer* pBuffer = Buffer::ObjectFromHandle(buffer); @@ -3217,6 +3240,51 @@ void CmdBuffer::DrawIndirect( DbgBarrierPostCmd((indexed ? DbgBarrierDrawIndexed : DbgBarrierDrawNonIndexed) | DbgBarrierDrawIndirect); } +// ===================================================================================================================== +template<bool indexed, bool useBufferCount> +void CmdBuffer::DrawIndirect( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, + VkDeviceSize countBufferVa) +{ + DbgBarrierPreCmd((indexed ? DbgBarrierDrawIndexed : DbgBarrierDrawNonIndexed) | DbgBarrierDrawIndirect); + + ValidateGraphicsStates(); + +#if VKI_RAY_TRACING + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0, 0); +#endif + + VK_ASSERT(stride <= indirectBufferSize); + Pal::GpuVirtAddrAndStride gpuVirtAddrAndStride = { indirectBufferVa, stride }; + + utils::IterateMask deviceGroup(m_curDeviceMask); + do + { + const uint32_t deviceIdx = deviceGroup.Index(); + + if (indexed == false) + { + PalCmdBuffer(deviceIdx)->CmdDrawIndirectMulti( + gpuVirtAddrAndStride, + count, + useBufferCount ? countBufferVa : 0); + } + else + { + PalCmdBuffer(deviceIdx)->CmdDrawIndexedIndirectMulti( + gpuVirtAddrAndStride, + count, + useBufferCount ? countBufferVa : 0); + } + } + while (deviceGroup.IterateNext()); + + DbgBarrierPostCmd((indexed ?
DbgBarrierDrawIndexed : DbgBarrierDrawNonIndexed) | DbgBarrierDrawIndirect); +} + // ===================================================================================================================== void CmdBuffer::DrawMeshTasks( uint32_t x, @@ -3230,7 +3298,7 @@ void CmdBuffer::DrawMeshTasks( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0, 0); #endif PalCmdDrawMeshTasks(x, y, z); @@ -3254,7 +3322,7 @@ void CmdBuffer::DrawMeshTasksIndirect( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0, 0); #endif PalCmdDrawMeshTasksIndirect(buffer, offset, count, stride, countBuffer, countOffset); @@ -3262,6 +3330,40 @@ void CmdBuffer::DrawMeshTasksIndirect( DbgBarrierPostCmd(DbgBarrierDrawMeshTasksIndirect); } +// ===================================================================================================================== +template<bool useBufferCount> +void CmdBuffer::DrawMeshTasksIndirect( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, + VkDeviceSize countBufferVa) +{ + DbgBarrierPreCmd(DbgBarrierDrawMeshTasksIndirect); + + ValidateGraphicsStates(); + +#if VKI_RAY_TRACING + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0, 0); +#endif + + VK_ASSERT(stride <= indirectBufferSize); + + Pal::GpuVirtAddrAndStride gpuVirtAddrAndStride = { indirectBufferVa, stride }; + + utils::IterateMask deviceGroup(m_curDeviceMask); + do + { + PalCmdBuffer(deviceGroup.Index())->CmdDispatchMeshIndirectMulti( + gpuVirtAddrAndStride, + count, + useBufferCount ?
countBufferVa : 0); + + } while (deviceGroup.IterateNext()); + + DbgBarrierPostCmd(DbgBarrierDrawMeshTasksIndirect); +} + // ===================================================================================================================== void CmdBuffer::Dispatch( uint32_t x, @@ -3276,7 +3378,7 @@ void CmdBuffer::Dispatch( } #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute, x, y, z, nullptr, 0); + BindRayQueryConstants(m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute, x, y, z, nullptr, 0, 0); #endif if (m_pDevice->GetRuntimeSettings().enableAlternatingThreadGroupOrder) @@ -3307,7 +3409,7 @@ void CmdBuffer::DispatchOffset( #if VKI_RAY_TRACING BindRayQueryConstants( - m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute, dim_x, dim_y, dim_z, nullptr, 0); + m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute, dim_x, dim_y, dim_z, nullptr, 0, 0); #endif PalCmdDispatchOffset(base_x, base_y, base_z, dim_x, dim_y, dim_z); @@ -3336,7 +3438,8 @@ void CmdBuffer::DispatchIndirect( 0, 0, pBuffer, - offset); + offset, + 0); #endif PalCmdDispatchIndirect(pBuffer, offset); @@ -3344,6 +3447,51 @@ void CmdBuffer::DispatchIndirect( DbgBarrierPostCmd(DbgBarrierDispatchIndirect); } +// ===================================================================================================================== +void CmdBuffer::DispatchIndirect( + VkDeviceSize indirectBufferVa) +{ + DbgBarrierPreCmd(DbgBarrierDispatchIndirect); + + if (PalPipelineBindingOwnedBy(Pal::PipelineBindPoint::Compute, PipelineBindCompute) == false) + { + RebindPipeline(); + } + +#if VKI_RAY_TRACING + BindRayQueryConstants( + m_allGpuState.pComputePipeline, Pal::PipelineBindPoint::Compute, 0, 0, 0, nullptr, 0, indirectBufferVa); +#endif + + utils::IterateMask deviceGroup(m_curDeviceMask); + do + { + const uint32_t deviceIdx = deviceGroup.Index(); + PalCmdBuffer(deviceIdx)->CmdDispatchIndirect(indirectBufferVa); + } + while (deviceGroup.IterateNext()); + + DbgBarrierPostCmd(DbgBarrierDispatchIndirect); +} + +// ===================================================================================================================== +void CmdBuffer::ExecuteIndirect( + VkBool32 isPreprocessed, + const VkGeneratedCommandsInfoNV* pInfo) +{ + IndirectCommandsLayout* pLayout = IndirectCommandsLayout::ObjectFromHandle(pInfo->indirectCommandsLayout); + + utils::IterateMask deviceGroup(m_curDeviceMask); + do + { + const uint32_t deviceIdx = deviceGroup.Index(); + pLayout->BindPreprocessBuffer(pInfo->preprocessBuffer, + pInfo->preprocessOffset, + deviceIdx); + } + while (deviceGroup.IterateNext()); +} + // ===================================================================================================================== // Performs a color clear (vkCmdClearColorImage) void CmdBuffer::ClearColorImage( @@ -3410,7 +3558,7 @@ void CmdBuffer::ClearColorImage( pPalRanges, 0, nullptr, - settings.enableColorClearAutoSync ? Pal::ColorClearAutoSync : 0); + settings.enableColorClearAutoSync ? 
static_cast<uint32>(Pal::ColorClearAutoSync) : 0); } virtStackFrame.FreeArray(pPalRanges); @@ -4882,7 +5030,7 @@ void CmdBuffer::PostDrawPreResolveSync() barrierInfo.transitionCount = 1; barrierInfo.pTransitions = &transition; - PalCmdBuffer(DefaultDeviceIndex)->CmdBarrier(barrierInfo); + PalCmdBarrier(barrierInfo, m_curDeviceMask); } // ===================================================================================================================== @@ -8027,9 +8175,7 @@ void CmdBuffer::RPSyncPoint( // Execute the barrier if it actually did anything if ((acquireReleaseInfo.dstGlobalStageMask != Pal::PipelineStageBottomOfPipe) || - ((acquireReleaseInfo.imageBarrierCount > 0) && isDstStageNotBottomOfPipe) || - ((rpBarrier.pipePointCount > 1) || - ((rpBarrier.pipePointCount == 1) && (rpBarrier.pipePoints[0] != Pal::HwPipeTop)))) + ((acquireReleaseInfo.imageBarrierCount > 0) && isDstStageNotBottomOfPipe)) { PalCmdReleaseThenAcquire( &acquireReleaseInfo, @@ -9797,49 +9943,48 @@ void CmdBuffer::DbgCmdBarrier(bool preCmd) (static_cast<uint32>(Pal::CacheCoherencyUsageFlags::CoherPresent) == CoherPresent)), "The PAL::CacheCoherencyUsageFlags enum has changed. Vulkan settings might need to be updated."); - Pal::HwPipePoint waitPoint; - Pal::HwPipePoint signalPoint; + uint32_t srcStageMask; + uint32_t dstStageMask; + uint32_t srcCacheMask; uint32_t dstCacheMask; if (preCmd) { - waitPoint = static_cast<Pal::HwPipePoint>(settings.dbgBarrierPreWaitPipePoint); - signalPoint = static_cast<Pal::HwPipePoint>(settings.dbgBarrierPreSignalPipePoint); + dstStageMask = ConvertWaitPointToPipeStage( + static_cast<Pal::HwPipePoint>(settings.dbgBarrierPreWaitPipePoint)); + srcStageMask = ConvertPipePointToPipeStage( + static_cast<Pal::HwPipePoint>(settings.dbgBarrierPreSignalPipePoint)); srcCacheMask = settings.dbgBarrierPreCacheSrcMask; dstCacheMask = settings.dbgBarrierPreCacheDstMask; } else { - waitPoint = static_cast<Pal::HwPipePoint>(settings.dbgBarrierPostWaitPipePoint); - signalPoint = static_cast<Pal::HwPipePoint>(settings.dbgBarrierPostSignalPipePoint); + dstStageMask = ConvertWaitPointToPipeStage( + static_cast<Pal::HwPipePoint>(settings.dbgBarrierPostWaitPipePoint)); + srcStageMask = ConvertPipePointToPipeStage( + static_cast<Pal::HwPipePoint>(settings.dbgBarrierPostSignalPipePoint)); srcCacheMask = settings.dbgBarrierPostCacheSrcMask; dstCacheMask = settings.dbgBarrierPostCacheDstMask; } - Pal::BarrierInfo barrier = {}; + Pal::AcquireReleaseInfo barrier = {}; - barrier.reason = RgpBarrierUnknownReason; // This code is debug-only code. - barrier.waitPoint = waitPoint; + barrier.reason = RgpBarrierUnknownReason; // This code is debug-only code.
+ barrier.dstGlobalStageMask = dstStageMask; - if (waitPoint != Pal::HwPipeTop || signalPoint != Pal::HwPipeTop) + if ((dstStageMask != Pal::PipelineStageTopOfPipe) || (srcStageMask != Pal::PipelineStageTopOfPipe)) { - barrier.pipePointWaitCount = 1; - barrier.pPipePoints = &signalPoint; + barrier.srcGlobalStageMask = srcStageMask; } - Pal::BarrierTransition transition = {}; - if (srcCacheMask != 0 || dstCacheMask != 0) { - transition.srcCacheMask = srcCacheMask; - transition.dstCacheMask = dstCacheMask; - - barrier.transitionCount = 1; - barrier.pTransitions = &transition; + barrier.srcGlobalAccessMask = srcCacheMask; + barrier.dstGlobalAccessMask = dstCacheMask; } - PalCmdBarrier(barrier, m_curDeviceMask); + PalCmdReleaseThenAcquire(barrier, m_curDeviceMask); } #endif @@ -10055,7 +10200,7 @@ void CmdBuffer::DrawIndirectByteCount( ValidateGraphicsStates(); #if VKI_RAY_TRACING - BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0); + BindRayQueryConstants(m_allGpuState.pGraphicsPipeline, Pal::PipelineBindPoint::Graphics, 0, 0, 0, nullptr, 0, 0); #endif utils::IterateMask deviceGroup(m_curDeviceMask); @@ -10456,12 +10601,55 @@ void CmdBuffer::TraceRays( while (deviceGroup.IterateNext()); } +// ===================================================================================================================== +void CmdBuffer::AddPatchCpsRequest( + uint32_t deviceIdx, + GpuRt::DispatchRaysConstants* pConstsMem, + uint64_t bufSize) +{ + VK_ASSERT(pConstsMem != nullptr); + m_maxCpsMemSize = Util::Max(m_maxCpsMemSize, bufSize); + Pal::Result result = m_patchCpsList[deviceIdx].PushBack(pConstsMem); + VK_ASSERT(result == Pal::Result::Success); +} + +// ===================================================================================================================== +void CmdBuffer::FreePatchCpsList() +{ + utils::IterateMask deviceGroup(m_curDeviceMask); + + do + { + const uint32_t deviceIdx = deviceGroup.Index(); + m_patchCpsList[deviceIdx].Clear(); + } + while (deviceGroup.IterateNext()); +} + +// ===================================================================================================================== +// Fill bufVa into each recorded patch request (called at execute time).
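Before the definition of ApplyPatchCpsRequests() that follows, a quick sketch of how this continuation-stack (CPS) patch plumbing is meant to fit together; the submit-time allocation site is outside this diff, so the exact call shape there is an assumption:

    // Record time: TraceRays -> GetRayTracingDispatchArgs() records the embedded-data
    // CPU pointer plus the worst-case CPS size for this command buffer:
    //     AddPatchCpsRequest(deviceIdx, pConstsMem, cpsMemorySize);
    //
    // Submit time (assumed queue-side call site): allocate at least m_maxCpsMemSize
    // bytes of GPU memory, then patch every recorded constant block with its VA:
    //     pCmdBuffer->ApplyPatchCpsRequests(deviceIdx, cpsMem);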
+void CmdBuffer::ApplyPatchCpsRequests( + uint32_t deviceIdx, + const Pal::IGpuMemory& cpsMem) const +{ + for (PatchCpsVector::Iter iter = m_patchCpsList[deviceIdx].Begin(); iter.Get() != nullptr; iter.Next()) + { + GpuRt::DispatchRaysConstants* pConstsMem = iter.Get(); + + m_pDevice->RayTrace()->GpuRt(deviceIdx)->PatchDispatchRaysConstants( + pConstsMem, + cpsMem.Desc().gpuVirtAddr, + m_maxCpsMemSize); + } +} + // ===================================================================================================================== void CmdBuffer::GetRayTracingDispatchArgs( uint32_t deviceIdx, const RuntimeSettings& settings, CmdPool* pCmdPool, const RayTracingPipeline* pPipeline, + uint32* pConstMem, Pal::gpusize constGpuAddr, uint32_t width, uint32_t height, @@ -10509,8 +10697,16 @@ void CmdBuffer::GetRayTracingDispatchArgs( pConstants->constData.cpsBackendStackSize = stackSizes.backendSize; if (settings.cpsFlags & CpsFlagStackInGlobalMem) { - // TODO: Record Cps stack requirement, create Cps stack at queue submission, and fill - // pConstants->constData.cpsGlobalMemoryAddressLo/Hi + const uint32 numRays = width * height * depth; + + const gpusize cpsMemorySize = m_pDevice->RayTrace()->GpuRt(deviceIdx)->GetCpsMemoryBytes( + stackSizes.frontendSize, + numRays); + + AddPatchCpsRequest( + deviceIdx, + reinterpret_cast<GpuRt::DispatchRaysConstants*>(pConstMem), + cpsMemorySize); } } @@ -10559,9 +10755,9 @@ void CmdBuffer::TraceRayPreSetup( const RuntimeSettings& settings = m_pDevice->GetRuntimeSettings(); const RayTracingPipeline* pPipeline = m_allGpuState.pRayTracingPipeline; - void* pConstData = PalCmdBuffer(deviceIdx)->CmdAllocateEmbeddedData(GpuRt::DispatchRaysConstantsDw, - 1, - pConstGpuAddr); + uint32* pConstMem = PalCmdBuffer(deviceIdx)->CmdAllocateEmbeddedData(GpuRt::DispatchRaysConstantsDw, + 1, + pConstGpuAddr); GpuRt::DispatchRaysConstants constants = {}; @@ -10569,6 +10765,7 @@ void CmdBuffer::TraceRayPreSetup( settings, m_pCmdPool, pPipeline, + pConstMem, *pConstGpuAddr, width, height, @@ -10579,7 +10776,7 @@ void CmdBuffer::TraceRayPreSetup( callableShaderBindingTable, &constants); - memcpy(pConstData, &constants, sizeof(constants)); + memcpy(pConstMem, &constants, sizeof(constants)); } // ===================================================================================================================== @@ -10958,7 +11155,8 @@ void CmdBuffer::BindRayQueryConstants( uint32_t height, uint32_t depth, Buffer* pIndirectBuffer, - VkDeviceSize indirectOffset) + VkDeviceSize indirectOffset, + VkDeviceSize indirectBufferVirtAddr) { if ((pPipeline != nullptr) && pPipeline->HasRayTracing()) { @@ -11004,7 +11202,7 @@ void CmdBuffer::BindRayQueryConstants( gpusize indirectBufferVa = (pIndirectBuffer != nullptr) ?
pIndirectBuffer->GpuVirtAddr(deviceIdx) + indirectOffset : - 0; + indirectBufferVirtAddr; if (indirectBufferVa == 0) { @@ -11078,10 +11276,10 @@ void CmdBuffer::InsertDebugMarker( #if ICD_GPUOPEN_DEVMODE_BUILD constexpr uint8 MarkerSourceApplication = 0; - const DevModeMgr* pDevModeMgr = m_pDevice->VkInstance()->GetDevModeMgr(); + const IDevMode* pDevMode = m_pDevice->VkInstance()->GetDevModeMgr(); // Insert Crash Analysis markers if requested - if ((pDevModeMgr != nullptr) && (pDevModeMgr->IsCrashAnalysisEnabled())) + if ((pDevMode != nullptr) && (pDevMode->IsCrashAnalysisEnabled())) { PalCmdBuffer(DefaultDeviceIndex)->CmdInsertExecutionMarker(isBegin, MarkerSourceApplication, @@ -11374,6 +11572,7 @@ void CmdBuffer::ValidateGraphicsStates() params.pPipeline = pGraphicsPipeline->GetPalPipeline(deviceIdx); params.gfxDynState = m_allGpuState.pipelineState[PipelineBindGraphics].dynamicBindInfo.gfxDynState; + params.gfxShaderInfo = pGraphicsPipeline->GetBindInfo(); if (params.gfxDynState.enable.depthClampMode && (params.gfxDynState.enable.depthClipMode == false)) { @@ -11463,16 +11662,16 @@ void CmdBuffer::ValidateGraphicsStates() Device::SetDefaultVrsRateParams(&vrsRate); } - if (m_allGpuState.minSampleShading > 0.0) + // Both MSAA and VRS would utilize the value of PS_ITER_SAMPLES + // Thus, choose the min combiner (i.e. choose the higher quality rate) when both features are + // enabled + if ((m_allGpuState.msaaCreateInfo.pixelShaderSamples > 1) && + (m_allGpuState.vrsRate.flags.exposeVrsPixelsMask == 1) && + (pGraphicsPipeline != nullptr) && + (pGraphicsPipeline->GetPipelineFlags().shadingRateUsedInShader == false)) { - if ((m_allGpuState.vrsRate.shadingRate == Pal::VrsShadingRate::_1x1) && - (pGraphicsPipeline != nullptr) && - (pGraphicsPipeline->GetPipelineFlags().shadingRateUsedInShader == false) && - pGraphicsPipeline->ContainsDynamicState(DynamicStatesInternal::FragmentShadingRateStateKhr)) - { - vrsRate.combinerState[static_cast<uint32>(Pal::VrsCombinerStage::PsIterSamples)] = - Pal::VrsCombiner::Override; - } + vrsRate.combinerState[static_cast<uint32>(Pal::VrsCombinerStage::PsIterSamples)] = + Pal::VrsCombiner::Min; } PalCmdBuffer(deviceIdx)->CmdSetPerDrawVrsRate(vrsRate); @@ -12655,6 +12854,38 @@ void CmdBuffer::DrawIndirect( VkBuffer countBuffer, VkDeviceSize countOffset); +template +void CmdBuffer::DrawIndirect<false, false>( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, + VkDeviceSize countBufferVa); + +template +void CmdBuffer::DrawIndirect<false, true>( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, + VkDeviceSize countBufferVa); + +template +void CmdBuffer::DrawIndirect<true, false>( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, + VkDeviceSize countBufferVa); + +template +void CmdBuffer::DrawIndirect<true, true>( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, + VkDeviceSize countBufferVa); + template void CmdBuffer::DrawMeshTasksIndirect<false>( VkBuffer buffer, VkDeviceSize offset, uint32_t count, uint32_t stride, @@ -12673,6 +12904,22 @@ void CmdBuffer::DrawMeshTasksIndirect( VkBuffer buffer, VkDeviceSize offset, uint32_t count, uint32_t stride, VkBuffer countBuffer, VkDeviceSize countOffset); +template +void CmdBuffer::DrawMeshTasksIndirect<false>( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, + VkDeviceSize countBufferVa); + +template +void CmdBuffer::DrawMeshTasksIndirect<true>( + VkDeviceSize indirectBufferVa, + VkDeviceSize indirectBufferSize, + uint32_t count, + uint32_t stride, +
VkDeviceSize countBufferVa); + template void CmdBuffer::ResolveImage( VkImage srcImage, diff --git a/icd/api/vk_compute_pipeline.cpp b/icd/api/vk_compute_pipeline.cpp index 2885aa4a..d2965fe0 100644 --- a/icd/api/vk_compute_pipeline.cpp +++ b/icd/api/vk_compute_pipeline.cpp @@ -105,9 +105,10 @@ VkResult ComputePipeline::CreatePipelineBinaries( Vkgc::BinaryData* pPipelineBinaries, PipelineMetadata* pBinaryMetadata) { - VkResult result = VK_SUCCESS; - const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); - PipelineCompiler* pDefaultCompiler = pDevice->GetCompiler(DefaultDeviceIndex); + VkResult result = VK_SUCCESS; + const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + PipelineCompiler* pDefaultCompiler = pDevice->GetCompiler(DefaultDeviceIndex); + bool storeBinaryToCache = true; // Load or create the pipeline binary PipelineBinaryCache* pPipelineBinaryCache = (pPipelineCache != nullptr) ? pPipelineCache->GetPipelineCache() @@ -121,7 +122,7 @@ VkResult ComputePipeline::CreatePipelineBinaries( if (shouldCompile) { - bool skipCacheQuery = settings.enablePipelineDump; + bool skipCacheQuery = false; if (skipCacheQuery == false) { @@ -140,21 +141,60 @@ VkResult ComputePipeline::CreatePipelineBinaries( } } - // Compile if unable to retrieve from cache if (shouldCompile) { - if (pBinaryCreateInfo->pTempBuffer == nullptr) + if ((pDevice->GetRuntimeSettings().ignoreFlagFailOnPipelineCompileRequired == false) && + (flags & VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR)) { - result = pDefaultCompiler->ConvertComputePipelineInfo( - pDevice, - pCreateInfo, - pShaderInfo, - pPipelineOptimizerKey, - pBinaryMetadata, - pBinaryCreateInfo, - flags); + result = VK_PIPELINE_COMPILE_REQUIRED_EXT; + } + } + + bool shouldConvert = (pCreateInfo != nullptr) && + (pDevice->GetRuntimeSettings().enablePipelineDump || + (shouldCompile && (pBinaryCreateInfo->pTempBuffer == nullptr))); + + VkResult convertResult = VK_ERROR_UNKNOWN; + if (shouldConvert) + { + convertResult = pDefaultCompiler->ConvertComputePipelineInfo( + pDevice, + pCreateInfo, + pShaderInfo, + pPipelineOptimizerKey, + pBinaryMetadata, + pBinaryCreateInfo, + flags); + result = (result == VK_SUCCESS) ? 
convertResult : result; + } + + if ((result == VK_SUCCESS) && (convertResult == VK_SUCCESS) && shouldCompile) + { + if (IsShaderModuleIdentifier(pBinaryCreateInfo->pipelineInfo.cs)) + { + result = VK_ERROR_UNKNOWN; + } + } + + if (pDevice->GetRuntimeSettings().enablePipelineDump && (convertResult == VK_SUCCESS)) + { + if ((shouldCompile == false) || (result != VK_SUCCESS)) + { + Vkgc::PipelineBuildInfo pipelineInfo = {}; + pipelineInfo.pComputeInfo = &pBinaryCreateInfo->pipelineInfo; + pDefaultCompiler->DumpPipeline( + pDevice->GetRuntimeSettings(), + pipelineInfo, + pBinaryCreateInfo->apiPsoHash, + 1, + &pPipelineBinaries[deviceIdx], + result); } + } + // Compile if unable to retrieve from cache + if (shouldCompile) + { if (result == VK_SUCCESS) { result = pDevice->GetCompiler(deviceIdx)->CreateComputePipelineBinary( @@ -185,7 +225,8 @@ VkResult ComputePipeline::CreatePipelineBinaries( } // Add to any cache layer where missing - if (result == VK_SUCCESS) + if ((result == VK_SUCCESS) && storeBinaryToCache) + { pDevice->GetCompiler(deviceIdx)->CachePipelineBinary( &pCacheIds[deviceIdx], @@ -330,45 +371,48 @@ VkResult ComputePipeline::Create( uint64 startTimeTicks = Util::GetPerfCpuTime(); // Setup PAL create info from Vulkan inputs - Vkgc::BinaryData pipelineBinaries[MaxPalDevices] = {}; - Util::MetroHash::Hash cacheId[MaxPalDevices] = {}; - PipelineCompiler* pDefaultCompiler = pDevice->GetCompiler(DefaultDeviceIndex); - const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); - ComputePipelineBinaryCreateInfo binaryCreateInfo = {}; - PipelineOptimizerKey pipelineOptimizerKey = {}; - ShaderOptimizerKey shaderOptimizerKey = {}; - ShaderModuleHandle tempModule = {}; - VkResult result = VK_SUCCESS; - PipelineMetadata binaryMetadata = {}; - ComputePipelineExtStructs extStructs = {}; + Vkgc::BinaryData pipelineBinaries[MaxPalDevices] = {}; + Util::MetroHash::Hash cacheId[MaxPalDevices] = {}; + PipelineCompiler* pDefaultCompiler = pDevice->GetCompiler(DefaultDeviceIndex); + const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + ComputePipelineBinaryCreateInfo binaryCreateInfo = {}; + PipelineOptimizerKey pipelineOptimizerKey = {}; + ShaderOptimizerKey shaderOptimizerKey = {}; + ShaderModuleHandle tempModule = {}; + VkResult result = VK_SUCCESS; + PipelineMetadata binaryMetadata = {}; + ComputePipelineExtStructs extStructs = {}; + bool binariesProvided = false; HandleExtensionStructs(pCreateInfo, &extStructs); ComputePipelineShaderStageInfo shaderInfo = {}; uint64_t apiPsoHash = {}; - // 1. Create Cache IDs - result = ComputePipeline::CreateCacheId( - pDevice, - pCreateInfo, - flags, - &shaderInfo, - &binaryCreateInfo, - &shaderOptimizerKey, - &pipelineOptimizerKey, - &apiPsoHash, - &tempModule, - &cacheId[0]); - - binaryCreateInfo.apiPsoHash = apiPsoHash; - - // 2. Create pipeline binaries (or load from cache) auto pPipelineCreationFeedbackCreateInfo = extStructs.pPipelineCreationFeedbackCreateInfoEXT; PipelineCompiler::InitPipelineCreationFeedback(pPipelineCreationFeedbackCreateInfo); - if (result == VK_SUCCESS) + if ((result == VK_SUCCESS) && (binariesProvided == false)) { + // 1. Create Cache IDs + result = ComputePipeline::CreateCacheId( + pDevice, + pCreateInfo, + flags, + &shaderInfo, + &binaryCreateInfo, + &shaderOptimizerKey, + &pipelineOptimizerKey, + &apiPsoHash, + &tempModule, + &cacheId[0]); + + binaryCreateInfo.apiPsoHash = apiPsoHash; + + // 2. 
Create pipeline binaries (or load from cache) + if (result == VK_SUCCESS) + { result = CreatePipelineBinaries( pDevice, pCreateInfo, @@ -381,6 +425,8 @@ VkResult ComputePipeline::Create( cacheId, pipelineBinaries, &binaryMetadata); + } + } CreateInfo localPipelineInfo = {}; @@ -411,9 +457,11 @@ VkResult ComputePipeline::Create( pDevice->PalDevice(DefaultDeviceIndex)->GetComputePipelineSize(localPipelineInfo.pipeline, &palResult); VK_ASSERT(palResult == Pal::Result::Success); + size_t allocationSize = sizeof(ComputePipeline) + (pipelineSize * pDevice->NumPalDevices()); + pSystemMem = pDevice->AllocApiObject( pAllocator, - sizeof(ComputePipeline) + (pipelineSize * pDevice->NumPalDevices())); + allocationSize); if (pSystemMem == nullptr) { @@ -483,6 +531,7 @@ VkResult ComputePipeline::Create( } result = PalToVkResult(palResult); + } if (result == VK_SUCCESS) @@ -525,6 +574,7 @@ VkResult ComputePipeline::Create( } else { + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) { // Internal memory allocation failed, free PAL event object if it gets created @@ -541,7 +591,7 @@ VkResult ComputePipeline::Create( // Free the created pipeline binaries now that the PAL Pipelines/PipelineBinaryInfo have read them. for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) { - if (pipelineBinaries[deviceIdx].pCode != nullptr) + if ((binariesProvided == false) && (pipelineBinaries[deviceIdx].pCode != nullptr)) { pDevice->GetCompiler(deviceIdx)->FreeComputePipelineBinary( &binaryCreateInfo, pipelineBinaries[deviceIdx]); @@ -608,7 +658,7 @@ VkResult ComputePipeline::Create( // ===================================================================================================================== // Create cacheId for a compute pipeline. 
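The ignoreFlagFailOnPipelineCompileRequired path above means the driver now honors VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR whenever the binary is not already in a cache. A minimal application-side sketch of the pattern this enables (stage and layout setup elided and assumed):

    VkPipelineCreateFlags2CreateInfoKHR flags2 = {};
    flags2.sType = VK_STRUCTURE_TYPE_PIPELINE_CREATE_FLAGS_2_CREATE_INFO_KHR;
    flags2.flags = VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR;

    VkComputePipelineCreateInfo createInfo = {};
    createInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
    createInfo.pNext = &flags2;
    // ... shader stage and pipeline layout setup elided ...

    VkResult result = vkCreateComputePipelines(device, pipelineCache, 1, &createInfo, nullptr, &pipeline);
    if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT)
    {
        // Binary not in any cache: defer compilation to a worker thread instead of stalling here.
    }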
VkResult ComputePipeline::CreateCacheId( - Device* pDevice, + const Device* pDevice, const VkComputePipelineCreateInfo* pCreateInfo, VkPipelineCreateFlags2KHR flags, ComputePipelineShaderStageInfo* pShaderInfo, diff --git a/icd/api/vk_conv.cpp b/icd/api/vk_conv.cpp index 918d9548..d43cd44f 100644 --- a/icd/api/vk_conv.cpp +++ b/icd/api/vk_conv.cpp @@ -899,6 +899,62 @@ const char* VkResultName( case VkResult::VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT: errName = "VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT"; break; + + case VkResult::VK_ERROR_IMAGE_USAGE_NOT_SUPPORTED_KHR: + errName = "VK_ERROR_IMAGE_USAGE_NOT_SUPPORTED_KHR"; + break; + + case VkResult::VK_ERROR_VIDEO_PICTURE_LAYOUT_NOT_SUPPORTED_KHR: + errName = "VK_ERROR_VIDEO_PICTURE_LAYOUT_NOT_SUPPORTED_KHR"; + break; + + case VkResult::VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR: + errName = "VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR"; + break; + + case VkResult::VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR: + errName = "VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR"; + break; + + case VkResult::VK_ERROR_VIDEO_PROFILE_CODEC_NOT_SUPPORTED_KHR: + errName = "VK_ERROR_VIDEO_PROFILE_CODEC_NOT_SUPPORTED_KHR"; + break; + + case VkResult::VK_ERROR_VIDEO_STD_VERSION_NOT_SUPPORTED_KHR: + errName = "VK_ERROR_VIDEO_STD_VERSION_NOT_SUPPORTED_KHR"; + break; + + case VkResult::VK_THREAD_IDLE_KHR: + errName = "VK_THREAD_IDLE_KHR"; + break; + + case VkResult::VK_THREAD_DONE_KHR: + errName = "VK_THREAD_DONE_KHR"; + break; + case VkResult::VK_OPERATION_DEFERRED_KHR: + errName = "VK_OPERATION_DEFERRED_KHR"; + break; + + case VkResult::VK_OPERATION_NOT_DEFERRED_KHR: + errName = "VK_OPERATION_NOT_DEFERRED_KHR"; + break; + + case VkResult::VK_ERROR_INVALID_VIDEO_STD_PARAMETERS_KHR: + errName = "VK_ERROR_INVALID_VIDEO_STD_PARAMETERS_KHR"; + break; + + case VkResult::VK_ERROR_COMPRESSION_EXHAUSTED_EXT: + errName = "VK_ERROR_COMPRESSION_EXHAUSTED_EXT"; + break; + + case VkResult::VK_ERROR_PIPELINE_COMPILE_REQUIRED_EXT: + errName = "VK_ERROR_PIPELINE_COMPILE_REQUIRED_EXT"; + break; + + case VkResult::VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT: + errName = "VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT"; + break; + default: VK_NOT_IMPLEMENTED; errName = "??"; diff --git a/icd/api/vk_descriptor_pool.cpp b/icd/api/vk_descriptor_pool.cpp index 7fc6172c..567b1495 100644 --- a/icd/api/vk_descriptor_pool.cpp +++ b/icd/api/vk_descriptor_pool.cpp @@ -566,7 +566,9 @@ VkResult DescriptorGpuMemHeap::Init( if (pTypeCount[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) { uint32_t maxSize = 0; - if (pMutableDescriptorTypeCreateInfoEXT != nullptr) + if ((pMutableDescriptorTypeCreateInfoEXT != nullptr) && + (pMutableDescriptorTypeCreateInfoEXT->pMutableDescriptorTypeLists != nullptr) && + (i < pMutableDescriptorTypeCreateInfoEXT->mutableDescriptorTypeListCount)) { const VkMutableDescriptorTypeListEXT& list = pMutableDescriptorTypeCreateInfoEXT->pMutableDescriptorTypeLists[i]; diff --git a/icd/api/vk_device.cpp b/icd/api/vk_device.cpp index 86a3006b..98db2cc7 100644 --- a/icd/api/vk_device.cpp +++ b/icd/api/vk_device.cpp @@ -69,6 +69,7 @@ #include "include/graphics_pipeline_common.h" #include "include/vk_graphics_pipeline_library.h" #include "include/internal_layer_hooks.h" +#include "include/vk_indirect_commands_layout.h" #if VKI_RAY_TRACING #include "raytrace/ray_tracing_device.h" @@ -2394,7 +2395,10 @@ VkResult Device::WaitForFences( ppPalFences[i] = Fence::ObjectFromHandle(pFences[i])->PalFence(DefaultDeviceIndex); } - palResult = 
PalDevice(DefaultDeviceIndex)->WaitForFences(fenceCount, ppPalFences, waitAll != VK_FALSE, timeout); + palResult = PalDevice(DefaultDeviceIndex)->WaitForFences(fenceCount, + ppPalFences, + waitAll != VK_FALSE, + Uint64ToChronoNano(timeout)); } else { @@ -2424,7 +2428,7 @@ VkResult Device::WaitForFences( palResult = PalDevice(deviceIdx)->WaitForFences(perDeviceFenceCount, ppPalFences, waitAll != VK_FALSE, - timeout); + Uint64ToChronoNano(timeout)); } } } @@ -3172,8 +3176,9 @@ VkResult Device::WaitSemaphores( { flags |= Pal::HostWaitFlags::HostWaitAny; } + palResult = PalDevice(DefaultDeviceIndex)->WaitForSemaphores(pWaitInfo->semaphoreCount, ppPalSemaphores, - pWaitInfo->pValues, flags, timeout); + pWaitInfo->pValues, flags, Uint64ToChronoNano(timeout)); return PalToVkResult(palResult); } @@ -3640,6 +3645,15 @@ VkResult Device::AllocBorderColorPalette() return result; } +// ================================================================================================================= +VkResult Device::CreateIndirectCommandsLayout( + const VkIndirectCommandsLayoutCreateInfoNV* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkIndirectCommandsLayoutNV* pIndirectCommandsLayout) +{ + return IndirectCommandsLayout::Create(this, pCreateInfo, pAllocator, pIndirectCommandsLayout); +} + // ===================================================================================================================== void Device::DestroyBorderColorPalette() { @@ -5383,6 +5397,45 @@ VKAPI_ATTR void VKAPI_CALL vkGetImageSubresourceLayout2KHR( &pSubresource->imageSubresource, &pLayout->subresourceLayout); } +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkGetGeneratedCommandsMemoryRequirementsNV( + VkDevice device, + const VkGeneratedCommandsMemoryRequirementsInfoNV* pInfo, + VkMemoryRequirements2* pMemoryRequirements) +{ + const Device* pDevice = ApiDevice::ObjectFromHandle(device); + const IndirectCommandsLayout* pLayout = IndirectCommandsLayout::ObjectFromHandle(pInfo->indirectCommandsLayout); + + pLayout->CalculateMemoryRequirements(pDevice, pMemoryRequirements); +} + +// ===================================================================================================================== +VKAPI_ATTR VkResult VKAPI_CALL vkCreateIndirectCommandsLayoutNV( + VkDevice device, + const VkIndirectCommandsLayoutCreateInfoNV* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkIndirectCommandsLayoutNV* pIndirectCommandsLayout) +{ + Device* pDevice = ApiDevice::ObjectFromHandle(device); + const VkAllocationCallbacks* pAllocCB = pAllocator ? pAllocator : pDevice->VkInstance()->GetAllocCallbacks(); + + return pDevice->CreateIndirectCommandsLayout(pCreateInfo, pAllocCB, pIndirectCommandsLayout); +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkDestroyIndirectCommandsLayoutNV( + VkDevice device, + VkIndirectCommandsLayoutNV indirectCommandsLayout, + const VkAllocationCallbacks* pAllocator) +{ + if (indirectCommandsLayout != VK_NULL_HANDLE) + { + Device* pDevice = ApiDevice::ObjectFromHandle(device); + const VkAllocationCallbacks* pAllocCB = pAllocator ? 
pAllocator : pDevice->VkInstance()->GetAllocCallbacks(); + + IndirectCommandsLayout::ObjectFromHandle(indirectCommandsLayout)->Destroy(pDevice, pAllocCB); + } +} } // entry @@ -5393,3 +5446,6 @@ template VkPipelineCreateFlags2KHR vk::Device::GetPipelineCreateFlags( const VkRayTracingPipelineCreateInfoKHR* pCreateInfo); #endif +template +VkPipelineCreateFlags2KHR vk::Device::GetPipelineCreateFlags( + const VkComputePipelineCreateInfo* pCreateInfo); diff --git a/icd/api/vk_dispatch.cpp b/icd/api/vk_dispatch.cpp index 9c8ae471..1f2a2468 100644 --- a/icd/api/vk_dispatch.cpp +++ b/icd/api/vk_dispatch.cpp @@ -791,6 +791,17 @@ void DispatchTable::Init() INIT_DISPATCH_ENTRY(vkGetShaderModuleIdentifierEXT ); INIT_DISPATCH_ENTRY(vkGetShaderModuleCreateInfoIdentifierEXT ); + INIT_DISPATCH_ENTRY(vkCreateIndirectCommandsLayoutNV ); + INIT_DISPATCH_ENTRY(vkDestroyIndirectCommandsLayoutNV ); + INIT_DISPATCH_ENTRY(vkGetGeneratedCommandsMemoryRequirementsNV ); + INIT_DISPATCH_ENTRY(vkCmdPreprocessGeneratedCommandsNV ); + INIT_DISPATCH_ENTRY(vkCmdExecuteGeneratedCommandsNV ); + INIT_DISPATCH_ENTRY(vkCmdBindPipelineShaderGroupNV ); + + INIT_DISPATCH_ENTRY(vkGetPipelineIndirectDeviceAddressNV ); + INIT_DISPATCH_ENTRY(vkGetPipelineIndirectMemoryRequirementsNV ); + INIT_DISPATCH_ENTRY(vkCmdUpdatePipelineIndirectBufferNV ); + INIT_DISPATCH_ENTRY(vkCmdSetTessellationDomainOriginEXT ); INIT_DISPATCH_ENTRY(vkCmdSetDepthClampEnableEXT ); INIT_DISPATCH_ENTRY(vkCmdSetPolygonModeEXT ); diff --git a/icd/api/vk_formats.cpp b/icd/api/vk_formats.cpp index d593e09b..39cc12b1 100644 --- a/icd/api/vk_formats.cpp +++ b/icd/api/vk_formats.cpp @@ -30,6 +30,7 @@ */ #include "include/vk_formats.h" #include "include/vk_conv.h" +#include "include/vk_physical_device.h" namespace vk { #if ( VKI_GPU_DECOMPRESS) @@ -321,4 +322,344 @@ Pal::Formats::NumericSupportFlags Formats::GetNumberFormat( return numType; } +// ===================================================================================================================== +// Individual planes of multi-planar formats are size-compatible with single-plane color formats if they occupy +// the same number of bits per texel block, and are compatible with those formats if they have the same block extent. +// See 34.1.1 Compatible Formats of Planes of Multi-Planar Formats +VkFormat Formats::GetCompatibleSinglePlaneFormat(VkFormat multiPlaneFormat, uint32_t planeIndex) +{ + VK_ASSERT(GetYuvPlaneCounts(multiPlaneFormat) > 1); + VkFormat singlePlaneFormat = VK_FORMAT_UNDEFINED; + + if (planeIndex < GetYuvPlaneCounts(multiPlaneFormat)) + { + // The conversion below is based on the table in 34.1.1. + // Individual planes of a multi-planar format are in turn format compatible with the listed single plane + // format's Format Compatibility Classes (See 34.1.7).
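Concretely, the spec table that the switch below encodes maps each plane to the single-plane format with the same texel-block size; an illustrative use of the helper for the common two-plane 4:2:0 case (expected results per the spec's plane-compatibility table):

    VkFormat plane0 = Formats::GetCompatibleSinglePlaneFormat(VK_FORMAT_G8_B8R8_2PLANE_420_UNORM, 0);
    VkFormat plane1 = Formats::GetCompatibleSinglePlaneFormat(VK_FORMAT_G8_B8R8_2PLANE_420_UNORM, 1);
    VK_ASSERT(plane0 == VK_FORMAT_R8_UNORM);   // 8-bit luma plane
    VK_ASSERT(plane1 == VK_FORMAT_R8G8_UNORM); // interleaved 8-bit chroma plane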
+ switch (multiPlaneFormat) + { + case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: + case VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM: + case VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM: + singlePlaneFormat = VK_FORMAT_R8_UNORM; + break; + case VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_420_UNORM_3PACK16: + case VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_422_UNORM_3PACK16: + case VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_444_UNORM_3PACK16: + singlePlaneFormat = VK_FORMAT_R10X6_UNORM_PACK16; + break; + case VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_420_UNORM_3PACK16: + case VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_422_UNORM_3PACK16: + case VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_444_UNORM_3PACK16: + singlePlaneFormat = VK_FORMAT_R12X4_UNORM_PACK16; + break; + case VK_FORMAT_G16_B16_R16_3PLANE_420_UNORM: + case VK_FORMAT_G16_B16_R16_3PLANE_422_UNORM: + case VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM: + singlePlaneFormat = VK_FORMAT_R16_UNORM; + break; + case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: + case VK_FORMAT_G8_B8R8_2PLANE_422_UNORM: + case VK_FORMAT_G8_B8R8_2PLANE_444_UNORM: + singlePlaneFormat = (planeIndex == 0) ? + VK_FORMAT_R8_UNORM : + VK_FORMAT_R8G8_UNORM; + break; + case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16: + case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_422_UNORM_3PACK16: + case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_444_UNORM_3PACK16: + singlePlaneFormat = (planeIndex == 0) ? + VK_FORMAT_R10X6_UNORM_PACK16 : + VK_FORMAT_R10X6G10X6_UNORM_2PACK16; + break; + case VK_FORMAT_G12X4_B12X4R12X4_2PLANE_420_UNORM_3PACK16: + case VK_FORMAT_G12X4_B12X4R12X4_2PLANE_422_UNORM_3PACK16: + case VK_FORMAT_G12X4_B12X4R12X4_2PLANE_444_UNORM_3PACK16: + singlePlaneFormat = (planeIndex == 0) ? + VK_FORMAT_R12X4_UNORM_PACK16 : + VK_FORMAT_R12X4G12X4_UNORM_2PACK16; + break; + case VK_FORMAT_G16_B16R16_2PLANE_420_UNORM: + case VK_FORMAT_G16_B16R16_2PLANE_422_UNORM: + case VK_FORMAT_G16_B16R16_2PLANE_444_UNORM: + singlePlaneFormat = (planeIndex == 0) ? + VK_FORMAT_R16_UNORM : + VK_FORMAT_R16G16_UNORM; + break; + default: + break; + } + } + + return singlePlaneFormat; +} + +// ===================================================================================================================== +// Computes the extended feature set of a format when VK_IMAGE_CREATE_EXTENDED_USAGE_BIT is set +// NOTE: This function assumes the format that is passed in does not have +// Pal::Formats::PropertyFlags::BitCountInaccurate set +VkFormatFeatureFlags Formats::GetExtendedFeatureFlags( + const PhysicalDevice* pPhysicalDevice, + VkFormat format, + VkImageTiling tiling, + const RuntimeSettings& settings) +{ + VkFormatFeatureFlags extendedFeatures = 0; + Pal::SwizzledFormat palFormat = VkToPalFormat(format, settings); + + uint32 bitsPerPixel = Pal::Formats::BitsPerPixel(palFormat.format); + + // The following tables are from the Format Compatibility Classes section of the Vulkan specification. 
+ static constexpr VkFormat Bpp8FormatClass[] = + { + VK_FORMAT_R4G4_UNORM_PACK8, + VK_FORMAT_R8_UNORM, + VK_FORMAT_R8_SNORM, + VK_FORMAT_R8_USCALED, + VK_FORMAT_R8_SSCALED, + VK_FORMAT_R8_UINT, + VK_FORMAT_R8_SINT, + VK_FORMAT_R8_SRGB + }; + + static constexpr VkFormat Bpp16FormatClass[] = + { + VK_FORMAT_R10X6_UNORM_PACK16, + VK_FORMAT_R12X4_UNORM_PACK16, + VK_FORMAT_A4R4G4B4_UNORM_PACK16, + VK_FORMAT_A4B4G4R4_UNORM_PACK16, + VK_FORMAT_R4G4B4A4_UNORM_PACK16, + VK_FORMAT_B4G4R4A4_UNORM_PACK16, + VK_FORMAT_R5G6B5_UNORM_PACK16, + VK_FORMAT_B5G6R5_UNORM_PACK16, + VK_FORMAT_R5G5B5A1_UNORM_PACK16, + VK_FORMAT_B5G5R5A1_UNORM_PACK16, + VK_FORMAT_A1R5G5B5_UNORM_PACK16, + VK_FORMAT_R8G8_UNORM, + VK_FORMAT_R8G8_SNORM, + VK_FORMAT_R8G8_USCALED, + VK_FORMAT_R8G8_SSCALED, + VK_FORMAT_R8G8_UINT, + VK_FORMAT_R8G8_SINT, + VK_FORMAT_R8G8_SRGB, + VK_FORMAT_R16_UNORM, + VK_FORMAT_R16_SNORM, + VK_FORMAT_R16_USCALED, + VK_FORMAT_R16_SSCALED, + VK_FORMAT_R16_UINT, + VK_FORMAT_R16_SINT, + VK_FORMAT_R16_SFLOAT + }; + + static constexpr VkFormat Bpp24FormatClass[] = + { + VK_FORMAT_R8G8B8_UNORM, + VK_FORMAT_R8G8B8_SNORM, + VK_FORMAT_R8G8B8_USCALED, + VK_FORMAT_R8G8B8_SSCALED, + VK_FORMAT_R8G8B8_UINT, + VK_FORMAT_R8G8B8_SINT, + VK_FORMAT_R8G8B8_SRGB, + VK_FORMAT_B8G8R8_UNORM, + VK_FORMAT_B8G8R8_SNORM, + VK_FORMAT_B8G8R8_USCALED, + VK_FORMAT_B8G8R8_SSCALED, + VK_FORMAT_B8G8R8_UINT, + VK_FORMAT_B8G8R8_SINT, + VK_FORMAT_B8G8R8_SRGB + }; + + static constexpr VkFormat Bpp32FormatClass[] = + { + VK_FORMAT_R10X6G10X6_UNORM_2PACK16, + VK_FORMAT_R12X4G12X4_UNORM_2PACK16, + VK_FORMAT_R8G8B8A8_UNORM, + VK_FORMAT_R8G8B8A8_SNORM, + VK_FORMAT_R8G8B8A8_USCALED, + VK_FORMAT_R8G8B8A8_SSCALED, + VK_FORMAT_R8G8B8A8_UINT, + VK_FORMAT_R8G8B8A8_SINT, + VK_FORMAT_R8G8B8A8_SRGB, + VK_FORMAT_B8G8R8A8_UNORM, + VK_FORMAT_B8G8R8A8_SNORM, + VK_FORMAT_B8G8R8A8_USCALED, + VK_FORMAT_B8G8R8A8_SSCALED, + VK_FORMAT_B8G8R8A8_UINT, + VK_FORMAT_B8G8R8A8_SINT, + VK_FORMAT_B8G8R8A8_SRGB, + VK_FORMAT_A8B8G8R8_UNORM_PACK32, + VK_FORMAT_A8B8G8R8_SNORM_PACK32, + VK_FORMAT_A8B8G8R8_USCALED_PACK32, + VK_FORMAT_A8B8G8R8_SSCALED_PACK32, + VK_FORMAT_A8B8G8R8_UINT_PACK32, + VK_FORMAT_A8B8G8R8_SINT_PACK32, + VK_FORMAT_A8B8G8R8_SRGB_PACK32, + VK_FORMAT_A2R10G10B10_UNORM_PACK32, + VK_FORMAT_A2R10G10B10_SNORM_PACK32, + VK_FORMAT_A2R10G10B10_USCALED_PACK32, + VK_FORMAT_A2R10G10B10_SSCALED_PACK32, + VK_FORMAT_A2R10G10B10_UINT_PACK32, + VK_FORMAT_A2R10G10B10_SINT_PACK32, + VK_FORMAT_A2B10G10R10_UNORM_PACK32, + VK_FORMAT_A2B10G10R10_SNORM_PACK32, + VK_FORMAT_A2B10G10R10_USCALED_PACK32, + VK_FORMAT_A2B10G10R10_SSCALED_PACK32, + VK_FORMAT_A2B10G10R10_UINT_PACK32, + VK_FORMAT_A2B10G10R10_SINT_PACK32, + VK_FORMAT_R16G16_UNORM, + VK_FORMAT_R16G16_SNORM, + VK_FORMAT_R16G16_USCALED, + VK_FORMAT_R16G16_SSCALED, + VK_FORMAT_R16G16_UINT, + VK_FORMAT_R16G16_SINT, + VK_FORMAT_R16G16_SFLOAT, + VK_FORMAT_R32_UINT, + VK_FORMAT_R32_SINT, + VK_FORMAT_R32_SFLOAT, + VK_FORMAT_B10G11R11_UFLOAT_PACK32, + VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 + }; + + static constexpr VkFormat Bpp48FormatClass[] = + { + VK_FORMAT_R16G16B16_UNORM, + VK_FORMAT_R16G16B16_SNORM, + VK_FORMAT_R16G16B16_USCALED, + VK_FORMAT_R16G16B16_SSCALED, + VK_FORMAT_R16G16B16_UINT, + VK_FORMAT_R16G16B16_SINT, + VK_FORMAT_R16G16B16_SFLOAT + }; + + static constexpr VkFormat Bpp64FormatClass[] = + { + VK_FORMAT_R16G16B16A16_UNORM, + VK_FORMAT_R16G16B16A16_SNORM, + VK_FORMAT_R16G16B16A16_USCALED, + VK_FORMAT_R16G16B16A16_SSCALED, + VK_FORMAT_R16G16B16A16_UINT, + VK_FORMAT_R16G16B16A16_SINT, + VK_FORMAT_R16G16B16A16_SFLOAT, + 
VK_FORMAT_R32G32_UINT, + VK_FORMAT_R32G32_SINT, + VK_FORMAT_R32G32_SFLOAT, + VK_FORMAT_R64_UINT, + VK_FORMAT_R64_SINT, + VK_FORMAT_R64_SFLOAT + }; + + static constexpr VkFormat Bpp96FormatClass[] = + { + VK_FORMAT_R32G32B32_UINT, + VK_FORMAT_R32G32B32_SINT, + VK_FORMAT_R32G32B32_SFLOAT + }; + + static constexpr VkFormat Bpp128FormatClass[] = + { + VK_FORMAT_R32G32B32A32_UINT, + VK_FORMAT_R32G32B32A32_SINT, + VK_FORMAT_R32G32B32A32_SFLOAT, + VK_FORMAT_R64G64_UINT, + VK_FORMAT_R64G64_SINT, + VK_FORMAT_R64G64_SFLOAT + }; + + static constexpr VkFormat Bpp192FormatClass[] = + { + VK_FORMAT_R64G64B64_UINT, + VK_FORMAT_R64G64B64_SINT, + VK_FORMAT_R64G64B64_SFLOAT + }; + + static constexpr VkFormat Bpp256FormatClass[] = + { + VK_FORMAT_R64G64B64A64_UINT, + VK_FORMAT_R64G64B64A64_SINT, + VK_FORMAT_R64G64B64A64_SFLOAT + }; + + // Depth images have no extended usage. + // YUV single and multiplanar images by themselves have no extended usage. To compute extended usage + // of a single plane of a multiplanar image call GetCompatibleSinglePlaneFormat and pass that format in. + // BC images allow conversion between UNORM|SRGB but there shouldn't be any difference in features. + bool noCompatibleExtendedUsage = Formats::IsDepthStencilFormat(format) || + Formats::IsYuvFormat(format) || + Pal::Formats::IsBlockCompressed(palFormat.format) || + (format == VK_FORMAT_UNDEFINED); + + if (noCompatibleExtendedUsage == false) + { + const VkFormat* pExtendedFormats = nullptr; + uint32_t extendedFormatCount = 0; + + switch (bitsPerPixel) + { + case 8: + pExtendedFormats = Bpp8FormatClass; + extendedFormatCount = sizeof(Bpp8FormatClass) / sizeof(VkFormat); + break; + case 16: + pExtendedFormats = Bpp16FormatClass; + extendedFormatCount = sizeof(Bpp16FormatClass) / sizeof(VkFormat); + break; + case 24: + pExtendedFormats = Bpp24FormatClass; + extendedFormatCount = sizeof(Bpp24FormatClass) / sizeof(VkFormat); + break; + case 32: + pExtendedFormats = Bpp32FormatClass; + extendedFormatCount = sizeof(Bpp32FormatClass) / sizeof(VkFormat); + break; + case 48: + pExtendedFormats = Bpp48FormatClass; + extendedFormatCount = sizeof(Bpp48FormatClass) / sizeof(VkFormat); + break; + case 64: + pExtendedFormats = Bpp64FormatClass; + extendedFormatCount = sizeof(Bpp64FormatClass) / sizeof(VkFormat); + break; + case 96: + pExtendedFormats = Bpp96FormatClass; + extendedFormatCount = sizeof(Bpp96FormatClass) / sizeof(VkFormat); + break; + case 128: + pExtendedFormats = Bpp128FormatClass; + extendedFormatCount = sizeof(Bpp128FormatClass) / sizeof(VkFormat); + break; + case 192: + pExtendedFormats = Bpp192FormatClass; + extendedFormatCount = sizeof(Bpp192FormatClass) / sizeof(VkFormat); + break; + case 256: + pExtendedFormats = Bpp256FormatClass; + extendedFormatCount = sizeof(Bpp256FormatClass) / sizeof(VkFormat); + break; + default: + VK_ALERT_ALWAYS_MSG("Unknown Format Class"); + } + + if ((extendedFormatCount > 0 && pExtendedFormats != nullptr)) + { + for (uint32_t i = 0; i < extendedFormatCount; ++i) + { + VkFormat extendedFormat = pExtendedFormats[i]; + + VkFormatProperties extendedFormatProperties = {}; + + VkResult result = pPhysicalDevice->GetFormatProperties(extendedFormat, &extendedFormatProperties); + if (result != VK_ERROR_FORMAT_NOT_SUPPORTED) + { + extendedFeatures |= (tiling == VK_IMAGE_TILING_OPTIMAL) ? 
+ extendedFormatProperties.optimalTilingFeatures : + extendedFormatProperties.linearTilingFeatures; + } + + } + } + } + + return extendedFeatures; +} + } diff --git a/icd/api/vk_graphics_pipeline.cpp b/icd/api/vk_graphics_pipeline.cpp index 08595c79..5fc1f618 100644 --- a/icd/api/vk_graphics_pipeline.cpp +++ b/icd/api/vk_graphics_pipeline.cpp @@ -86,8 +86,7 @@ VkResult GraphicsPipeline::CreatePipelineBinaries( if (shouldCompile) { - bool skipCacheQuery = pDevice->GetRuntimeSettings().enablePipelineDump; - + bool skipCacheQuery = false; if (skipCacheQuery == false) { // Search the pipeline binary cache @@ -105,26 +104,71 @@ VkResult GraphicsPipeline::CreatePipelineBinaries( } } - // Compile if unable to retrieve from cache if (shouldCompile) { - if ((deviceIdx == DefaultDeviceIndex) || (pCreateInfo == nullptr)) + if ((pDevice->GetRuntimeSettings().ignoreFlagFailOnPipelineCompileRequired == false) && + (flags & VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR)) { - if (pCreateInfo != nullptr) - { - result = pDefaultCompiler->ConvertGraphicsPipelineInfo( - pDevice, - pCreateInfo, - extStructs, - flags, - pShaderInfo, - pPipelineLayout, - pPipelineOptimizerKey, - pBinaryMetadata, - pBinaryCreateInfo); - } + result = VK_PIPELINE_COMPILE_REQUIRED_EXT; + } + } + + bool shouldConvert = (pCreateInfo != nullptr) && + (pDevice->GetRuntimeSettings().enablePipelineDump || + (shouldCompile && (deviceIdx == DefaultDeviceIndex))); + + VkResult convertResult = VK_ERROR_UNKNOWN; + if (shouldConvert) + { + convertResult = pDefaultCompiler->ConvertGraphicsPipelineInfo( + pDevice, + pCreateInfo, + extStructs, + flags, + pShaderInfo, + pPipelineLayout, + pPipelineOptimizerKey, + pBinaryMetadata, + pBinaryCreateInfo); + result = (result == VK_SUCCESS) ? convertResult : result; + } + + if ((result == VK_SUCCESS) && (convertResult == VK_SUCCESS) && shouldCompile) + { + if (IsShaderModuleIdentifier(pBinaryCreateInfo->pipelineInfo.vs) || + IsShaderModuleIdentifier(pBinaryCreateInfo->pipelineInfo.gs) || + IsShaderModuleIdentifier(pBinaryCreateInfo->pipelineInfo.tcs) || + IsShaderModuleIdentifier(pBinaryCreateInfo->pipelineInfo.tes) || + IsShaderModuleIdentifier(pBinaryCreateInfo->pipelineInfo.fs) || + IsShaderModuleIdentifier(pBinaryCreateInfo->pipelineInfo.task) || + IsShaderModuleIdentifier(pBinaryCreateInfo->pipelineInfo.mesh)) + { + result = VK_ERROR_UNKNOWN; + } + } + + if (pDevice->GetRuntimeSettings().enablePipelineDump && (convertResult == VK_SUCCESS)) + { + if ((shouldCompile == false) || (result != VK_SUCCESS)) + { + Vkgc::PipelineBuildInfo pipelineInfo = {}; + pipelineInfo.pGraphicsInfo = &pBinaryCreateInfo->pipelineInfo; + pDefaultCompiler->DumpPipeline( + pDevice->GetRuntimeSettings(), + pipelineInfo, + pBinaryCreateInfo->apiPsoHash, + 1, + &pPipelineBinaries[deviceIdx], + result); + } + } - if (result == VK_SUCCESS) + // Compile if unable to retrieve from cache + if (shouldCompile) + { + if (result == VK_SUCCESS) + { + if ((deviceIdx == DefaultDeviceIndex) || (pCreateInfo == nullptr)) { result = pDefaultCompiler->CreateGraphicsPipelineBinary( pDevice, @@ -145,45 +189,45 @@ VkResult GraphicsPipeline::CreatePipelineBinaries( pBinaryCreateInfo->pBinaryMetadata); } } - } - else - { - GraphicsPipelineBinaryCreateInfo binaryCreateInfoMGPU = {}; - PipelineMetadata binaryMetadataMGPU = {}; - result = pDefaultCompiler->ConvertGraphicsPipelineInfo( - pDevice, - pCreateInfo, - extStructs, - flags, - pShaderInfo, - pPipelineLayout, - pPipelineOptimizerKey, - &binaryMetadataMGPU, - 
&binaryCreateInfoMGPU); - - if (result == VK_SUCCESS) + else { - result = pDevice->GetCompiler(deviceIdx)->CreateGraphicsPipelineBinary( + GraphicsPipelineBinaryCreateInfo binaryCreateInfoMGPU = {}; + PipelineMetadata binaryMetadataMGPU = {}; + result = pDefaultCompiler->ConvertGraphicsPipelineInfo( pDevice, - deviceIdx, - pPipelineCache, - &binaryCreateInfoMGPU, + pCreateInfo, + extStructs, flags, - &pPipelineBinaries[deviceIdx], - &pCacheIds[deviceIdx]); - } + pShaderInfo, + pPipelineLayout, + pPipelineOptimizerKey, + &binaryMetadataMGPU, + &binaryCreateInfoMGPU); - if (result == VK_SUCCESS) - { - result = PipelineCompiler::SetPipelineCreationFeedbackInfo( - pCreationFeedbackInfo, - pCreateInfo->stageCount, - pCreateInfo->pStages, - &binaryCreateInfoMGPU.pipelineFeedback, - binaryCreateInfoMGPU.stageFeedback); - } + if (result == VK_SUCCESS) + { + result = pDevice->GetCompiler(deviceIdx)->CreateGraphicsPipelineBinary( + pDevice, + deviceIdx, + pPipelineCache, + &binaryCreateInfoMGPU, + flags, + &pPipelineBinaries[deviceIdx], + &pCacheIds[deviceIdx]); + } + + if (result == VK_SUCCESS) + { + result = PipelineCompiler::SetPipelineCreationFeedbackInfo( + pCreationFeedbackInfo, + pCreateInfo->stageCount, + pCreateInfo->pStages, + &binaryCreateInfoMGPU.pipelineFeedback, + binaryCreateInfoMGPU.stageFeedback); + } - pDefaultCompiler->FreeGraphicsPipelineCreateInfo(pDevice, &binaryCreateInfoMGPU, false, false); + pDefaultCompiler->FreeGraphicsPipelineCreateInfo(pDevice, &binaryCreateInfoMGPU, false, false); + } } } else if (deviceIdx == DefaultDeviceIndex) @@ -545,16 +589,17 @@ static bool IsGplFastLinkPossible( // ===================================================================================================================== void DumpGplFastLinkInfo( - const Device* pDevice, - VkPipeline pipeline, - GraphicsPipelineBinaryCreateInfo* pCreateInfo) + const Device* pDevice, + VkPipeline pipeline, + const GraphicsPipelineBinaryCreateInfo& createInfo, + const GraphicsPipelineLibraryInfo& libInfo) { const GraphicsPipeline* pGraphicsPipeline = GraphicsPipeline::ObjectFromHandle(pipeline); const Pal::IPipeline* pPalPipeline = pGraphicsPipeline->GetPalPipeline(DefaultDeviceIndex); const Pal::PipelineInfo info = pPalPipeline->GetInfo(); const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); - uint64_t dumpHash = settings.dumpPipelineWithApiHash ? pCreateInfo->apiPsoHash : info.internalPipelineHash.stable; + uint64_t dumpHash = settings.dumpPipelineWithApiHash ? 
createInfo.apiPsoHash : info.internalPipelineHash.stable; Vkgc::PipelineDumpOptions dumpOptions = {}; dumpOptions.pDumpDir = settings.pipelineDumpDir; @@ -563,49 +608,66 @@ void DumpGplFastLinkInfo( dumpOptions.dumpDuplicatePipelines = settings.dumpDuplicatePipelines; Vkgc::PipelineBuildInfo pipelineInfo = {}; - pCreateInfo->pipelineInfo.unlinked = false; - pipelineInfo.pGraphicsInfo = &pCreateInfo->pipelineInfo; + pipelineInfo.pGraphicsInfo = &createInfo.pipelineInfo; void* pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump(&dumpOptions, pipelineInfo, dumpHash); if (pPipelineDumpHandle != nullptr) { - char extraInfo[256] = {}; + char preRasterFileName[Util::MaxFileNameStrLen] = {}; + char fragmentFileName[Util::MaxFileNameStrLen] = {}; + char colorExportFileName[Util::MaxFileNameStrLen] = {}; + + const GraphicsPipelineBinaryCreateInfo& preRasterCreateInfo = + libInfo.pPreRasterizationShaderLib->GetPipelineBinaryCreateInfo(); + const GraphicsPipelineBinaryCreateInfo& fragmentCreateInfo = + libInfo.pFragmentShaderLib->GetPipelineBinaryCreateInfo(); + + uint64_t preRasterHash = settings.dumpPipelineWithApiHash ? + preRasterCreateInfo.apiPsoHash : preRasterCreateInfo.libraryHash[GraphicsLibraryPreRaster]; + uint64_t fragmentHash = settings.dumpPipelineWithApiHash ? + fragmentCreateInfo.apiPsoHash : fragmentCreateInfo.libraryHash[GraphicsLibraryFragment]; + + Vkgc::IPipelineDumper::GetPipelineName(&preRasterCreateInfo.pipelineInfo, + preRasterFileName, Util::MaxFileNameStrLen, preRasterHash); + Vkgc::IPipelineDumper::GetPipelineName(&fragmentCreateInfo.pipelineInfo, + fragmentFileName, Util::MaxFileNameStrLen, fragmentHash); + + if (createInfo.pipelineInfo.enableColorExportShader) + { + uint64_t colorExportHash = settings.dumpPipelineWithApiHash ? 
+ createInfo.apiPsoHash : createInfo.libraryHash[GraphicsLibraryColorExport]; + Vkgc::GraphicsPipelineBuildInfo colorExportInfo = {}; + colorExportInfo.unlinked = true; + Vkgc::IPipelineDumper::GetPipelineName(&colorExportInfo, + colorExportFileName, Util::MaxFileNameStrLen, colorExportHash); + } + + const char* fileNames[] = {preRasterFileName, fragmentFileName, colorExportFileName}; + Vkgc::IPipelineDumper::DumpGraphicsLibraryFileName(pPipelineDumpHandle, fileNames); + + char extraInfo[256] = {}; Util::Snprintf( extraInfo, sizeof(extraInfo), - "; ApiPsoHash: 0x%016" PRIX64 "\n", - pCreateInfo->apiPsoHash); + "\n; ApiPsoHash: 0x%016" PRIX64 "\n", + createInfo.apiPsoHash); Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, extraInfo); - for (uint32_t i = 0; i < GraphicsLibraryCount; i++) - { - if (pCreateInfo->pShaderLibraries[i] == nullptr) - { - continue; - } - const Pal::LibraryInfo& libInfo = pCreateInfo->pShaderLibraries[i]->GetInfo(); - Util::Snprintf( - extraInfo, - sizeof(extraInfo), - "; GraphicsPipelineLibrary Hash: 0x%016" PRIX64 "\n", - libInfo.internalLibraryHash.stable); - Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, extraInfo); - } for (uint32_t i = 0; i < GraphicsLibraryCount; i++) { - if (pCreateInfo->pShaderLibraries[i] == nullptr) + if (createInfo.pShaderLibraries[i] == nullptr) { continue; } uint32_t codeSize = 0; - Pal::Result result = pCreateInfo->pShaderLibraries[i]->GetCodeObject(&codeSize, nullptr); + Pal::Result result = createInfo.pShaderLibraries[i]->GetCodeObject(&codeSize, nullptr); if ((codeSize > 0) && (result == Pal::Result::Success)) { void* pCode = pDevice->VkInstance()->AllocMem(codeSize, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pCode != nullptr) { - result = pCreateInfo->pShaderLibraries[i]->GetCodeObject(&codeSize, pCode); + result = createInfo.pShaderLibraries[i]->GetCodeObject(&codeSize, pCode); VK_ASSERT(result == Pal::Result::Success); Vkgc::BinaryData libraryBinary = {}; @@ -619,8 +681,9 @@ void DumpGplFastLinkInfo( } } - PipelineCompiler::DumpPipelineMetadata(pPipelineDumpHandle, pCreateInfo->pBinaryMetadata); + PipelineCompiler::DumpPipelineMetadata(pPipelineDumpHandle, createInfo.pBinaryMetadata); + Vkgc::IPipelineDumper::DumpPipelineExtraInfo(pPipelineDumpHandle, "\n;CompileResult=FastLinkSuccess\n"); Vkgc::IPipelineDumper::EndPipelineDump(pPipelineDumpHandle); } } @@ -650,7 +713,7 @@ VkResult GraphicsPipeline::Create( GraphicsPipelineObjectCreateInfo objectCreateInfo = {}; GraphicsPipelineShaderStageInfo shaderStageInfo = {}; PipelineOptimizerKey pipelineOptimizerKey = {}; - uint64_t apiPsoHash = {}; + uint64_t apiPsoHash = 0; Util::MetroHash::Hash elfHash = {}; PipelineMetadata binaryMetadata = {}; PipelineLayout* pPipelineLayout = nullptr; @@ -671,7 +734,6 @@ VkResult GraphicsPipeline::Create( // 1. Check whether GPL fast link is possible if (pDevice->GetRuntimeSettings().useShaderLibraryForPipelineLibraryFastLink) { - // If pipeline only contains PreRasterizationShaderLib and no fragment shader is in the create info, // we add a null fragment library in order to use fast link. 
if ((libInfo.flags.isLibrary == false) && @@ -763,7 +825,6 @@ VkResult GraphicsPipeline::Create( &shaderOptimizerKeys[0], &pipelineOptimizerKey, &apiPsoHash, - //&elfHash, &tempModules[0], &cacheId[0]); @@ -921,7 +982,7 @@ VkResult GraphicsPipeline::Create( if (enableFastLink && pDevice->GetRuntimeSettings().enablePipelineDump) { - DumpGplFastLinkInfo(pDevice, *pPipeline, &binaryCreateInfo); + DumpGplFastLinkInfo(pDevice, *pPipeline, binaryCreateInfo, libInfo); } } @@ -1599,7 +1660,7 @@ VkResult GraphicsPipeline::Destroy( { if (m_deferWorkload.pEvent != nullptr) { - auto result = m_deferWorkload.pEvent->Wait(10); + auto result = m_deferWorkload.pEvent->Wait(Util::fseconds{ 10 }); if (result == Util::Result::Success) { Util::Destructor(m_deferWorkload.pEvent); diff --git a/icd/api/vk_graphics_pipeline_library.cpp b/icd/api/vk_graphics_pipeline_library.cpp index 55bb7568..8ce4eb75 100644 --- a/icd/api/vk_graphics_pipeline_library.cpp +++ b/icd/api/vk_graphics_pipeline_library.cpp @@ -352,17 +352,26 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( continue; } + if ((GetVkGraphicsLibraryFlagBit(pShaderStageInfo->stages[i].stage) ^ pLibInfo->libFlags) != 0) + { + continue; + } + if (canBuildShader) { - // We don't take care of the result. Early compile failure in some cases is expected - pCompiler->CreateGraphicsShaderBinary( + result = pCompiler->CreateGraphicsShaderBinary( pDevice, pPipelineCache, gplType, pBinaryCreateInfo, &pTempModuleStages[i]); gplMask |= (1 << gplType); } + + if (result != VK_SUCCESS) + { + break; + } } } - if (pLibInfo->flags.optimize) + if ((result == VK_SUCCESS) && pLibInfo->flags.optimize) { // We need to re-compile some stage if related new state is available if ((pLibInfo->libFlags & VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) && @@ -405,17 +414,20 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( palElfBinary = pCompiler->GetSolution(pBinaryCreateInfo->compilerType)-> ExtractPalElfBinary(pBinaryCreateInfo->earlyElfPackage[gplType]); - result = pCompiler->CreateGraphicsShaderLibrary(pDevice, - palElfBinary, - pAllocator, - &pBinaryCreateInfo->pShaderLibraries[gplType]); - pBinaryCreateInfo->earlyElfPackage[gplType].pCode = nullptr; - } - - if (pTempModuleStages[stage].elfPackage.codeSize > 0) - { - pDevice->VkInstance()->FreeMem(const_cast<void*>(pTempModuleStages[stage].elfPackage.pCode)); - pTempModuleStages[stage].elfPackage = {}; + if (palElfBinary.codeSize > 0) + { + result = pCompiler->CreateGraphicsShaderLibrary(pDevice, + palElfBinary, + pAllocator, + &pBinaryCreateInfo->pShaderLibraries[gplType]); + pBinaryCreateInfo->earlyElfPackage[gplType].pCode = nullptr; + + if (pTempModuleStages[stage].elfPackage.codeSize > 0) + { + pDevice->VkInstance()->FreeMem(const_cast<void*>(pTempModuleStages[stage].elfPackage.pCode)); + pTempModuleStages[stage].elfPackage = {}; + } + } } } @@ -693,11 +705,11 @@ GraphicsPipelineLibrary::GraphicsPipelineLibrary( const uint64_t apiHash, const GplModuleState* pGplModuleStates, const PipelineLayout* pPipelineLayout) + : GraphicsPipelineCommon( #if VKI_RAY_TRACING - : GraphicsPipelineCommon(false, pDevice), -#else - : GraphicsPipelineCommon(pDevice), + false, #endif + pDevice), m_objectCreateInfo(objectInfo), m_pBinaryCreateInfo(pBinaryInfo), m_libInfo(libInfo), diff --git a/icd/api/vk_indirect_commands_layout.cpp b/icd/api/vk_indirect_commands_layout.cpp new file mode 100644 index 00000000..0a84dd44 --- /dev/null +++ b/icd/api/vk_indirect_commands_layout.cpp @@ -0,0 +1,382 @@ +/* +
*********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file vk_indirect_commands_layout.cpp + * @brief Contains implementation of Vulkan indirect commands layout objects. + *********************************************************************************************************************** + */ + +#include "include/vk_indirect_commands_layout.h" +#include "include/vk_buffer.h" +#include "include/vk_conv.h" + +namespace vk +{ +// ===================================================================================================================== +// Creates an indirect commands layout object. 
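The Create() entry point below is reached through vkCreateIndirectCommandsLayoutNV. For context, a minimal hypothetical token stream that satisfies the driver's expectations (a single stream whose final token is the action) might look like this on the application side; 'device' and the resulting handles are placeholders.

    // App-side sketch: bind an index buffer, then issue an indexed draw per sequence.
    VkIndirectCommandsLayoutTokenNV tokens[2] = {};

    tokens[0].sType     = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_TOKEN_NV;
    tokens[0].tokenType = VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV;
    tokens[0].stream    = 0;
    tokens[0].offset    = 0;

    tokens[1].sType     = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_TOKEN_NV;
    tokens[1].tokenType = VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV; // the action token, last
    tokens[1].stream    = 0;
    tokens[1].offset    = sizeof(VkBindIndexBufferIndirectCommandNV);

    uint32_t stride = sizeof(VkBindIndexBufferIndirectCommandNV) + sizeof(VkDrawIndexedIndirectCommand);

    VkIndirectCommandsLayoutCreateInfoNV layoutInfo = {};
    layoutInfo.sType             = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_CREATE_INFO_NV;
    layoutInfo.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS;
    layoutInfo.tokenCount        = 2;
    layoutInfo.pTokens           = tokens;
    layoutInfo.streamCount       = 1;
    layoutInfo.pStreamStrides    = &stride;

    VkIndirectCommandsLayoutNV layout = VK_NULL_HANDLE;
    vkCreateIndirectCommandsLayoutNV(device, &layoutInfo, nullptr, &layout);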
+VkResult IndirectCommandsLayout::Create( + const Device* pDevice, + const VkIndirectCommandsLayoutCreateInfoNV* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkIndirectCommandsLayoutNV* pLayout) +{ + VkResult result = VK_SUCCESS; + Pal::Result palResult; + + Pal::IndirectCmdGeneratorCreateInfo createInfo = {}; + Pal::IndirectParam indirectParams[MaxIndirectTokenCount] = {}; + createInfo.pParams = &indirectParams[0]; + + Pal::IIndirectCmdGenerator* pGenerators[MaxPalDevices] = {}; + + const size_t apiSize = ObjectSize(pDevice); + size_t totalSize = apiSize; + size_t palSize = 0; + + void* pMemory = nullptr; + + IndirectCommandsInfo info = {}; + + VK_ASSERT(pCreateInfo->streamCount == 1); + VK_ASSERT(pCreateInfo->tokenCount > 0); + VK_ASSERT(pCreateInfo->tokenCount <= MaxIndirectTokenCount); + + if (pCreateInfo->tokenCount == 1) + { + VK_NOT_IMPLEMENTED; + } + + const VkIndirectCommandsLayoutTokenNV lastToken = pCreateInfo->pTokens[pCreateInfo->tokenCount - 1]; + + switch (lastToken.tokenType) + { + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV: + info.actionType = IndirectCommandsActionType::Draw; + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV: + info.actionType = IndirectCommandsActionType::DrawIndexed; + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NV: + info.actionType = IndirectCommandsActionType::Dispatch; + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_MESH_TASKS_NV: + info.actionType = IndirectCommandsActionType::MeshTask; + break; + + default: + VK_ALERT_ALWAYS_MSG("Indirect token streams must end with exactly one supported action token."); + result = VK_ERROR_UNKNOWN; + break; + } + + if (result == VK_SUCCESS) + { + BuildPalCreateInfo(pDevice, pCreateInfo, &indirectParams[0], &createInfo); + + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) + { + const size_t size = pDevice->PalDevice(deviceIdx)->GetIndirectCmdGeneratorSize(createInfo, + &palResult); + if (palResult == Pal::Result::Success) + { + palSize += size; + } + else + { + result = PalToVkResult(palResult); + break; + } + } + + totalSize += palSize; + } + + if (result == VK_SUCCESS) + { + pMemory = pDevice->AllocApiObject(pAllocator, totalSize); + if (pMemory == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + } + + if (result == VK_SUCCESS) + { + void* pPalMemory = Util::VoidPtrInc(pMemory, apiSize); + + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) + { + const size_t size = pDevice->PalDevice(deviceIdx)->GetIndirectCmdGeneratorSize(createInfo, + &palResult); + + if (palResult == Pal::Result::Success) + { + palResult = pDevice->PalDevice(deviceIdx)->CreateIndirectCmdGenerator(createInfo, + pPalMemory, + &pGenerators[deviceIdx]); + } + + if (palResult == Pal::Result::Success) + { + pPalMemory = Util::VoidPtrInc(pPalMemory, size); + } + else + { + result = PalToVkResult(palResult); + break; + } + } + } + + if (result == VK_SUCCESS) + { + VK_PLACEMENT_NEW(pMemory) IndirectCommandsLayout( + pDevice, + info, + pGenerators, + createInfo); + + *pLayout = IndirectCommandsLayout::HandleFromVoidPointer(pMemory); + } + + return result; +} + +// ===================================================================================================================== +IndirectCommandsLayout::IndirectCommandsLayout( + const Device* pDevice, + const IndirectCommandsInfo& info, + Pal::IIndirectCmdGenerator** pPalGenerator, + const Pal::IndirectCmdGeneratorCreateInfo& palCreateInfo) + : + m_info(info), + m_palCreateInfo(palCreateInfo)
+{ + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) + { + m_perGpu[deviceIdx].pGenerator = pPalGenerator[deviceIdx]; + m_perGpu[deviceIdx].preprocessBufferVirtAddr = 0; + } +} + +// ===================================================================================================================== +void IndirectCommandsLayout::BuildPalCreateInfo( + const Device* pDevice, + const VkIndirectCommandsLayoutCreateInfoNV* pCreateInfo, + Pal::IndirectParam* pIndirectParams, + Pal::IndirectCmdGeneratorCreateInfo* pPalCreateInfo) +{ + uint32_t bindingArgsSize = 0; + + const bool isDispatch = (pCreateInfo->pTokens[pCreateInfo->tokenCount - 1].tokenType + == VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NV); + + for (uint32_t i = 0; i < pCreateInfo->tokenCount; ++i) + { + const VkIndirectCommandsLayoutTokenNV& token = pCreateInfo->pTokens[i]; + + switch (token.tokenType) + { + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV: + pIndirectParams[i].type = Pal::IndirectParamType::Draw; + pIndirectParams[i].sizeInBytes = sizeof(Pal::DrawIndirectArgs); + static_assert(sizeof(Pal::DrawIndirectArgs) == sizeof(VkDrawIndirectCommand)); + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV: + pIndirectParams[i].type = Pal::IndirectParamType::DrawIndexed; + pIndirectParams[i].sizeInBytes = sizeof(Pal::DrawIndexedIndirectArgs); + static_assert(sizeof(Pal::DrawIndexedIndirectArgs) == sizeof(VkDrawIndexedIndirectCommand)); + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NV: + pIndirectParams[i].type = Pal::IndirectParamType::Dispatch; + pIndirectParams[i].sizeInBytes = sizeof(Pal::DispatchIndirectArgs); + static_assert(sizeof(Pal::DispatchIndirectArgs) == sizeof(VkDispatchIndirectCommand)); + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV: + pIndirectParams[i].type = Pal::IndirectParamType::BindIndexData; + pIndirectParams[i].sizeInBytes = sizeof(Pal::BindIndexDataIndirectArgs); + static_assert(sizeof(Pal::BindIndexDataIndirectArgs) == sizeof(VkBindIndexBufferIndirectCommandNV)); + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV: + pIndirectParams[i].type = Pal::IndirectParamType::BindVertexData; + pIndirectParams[i].sizeInBytes = sizeof(Pal::BindVertexDataIndirectArgs); + pIndirectParams[i].vertexData.bufferId = token.vertexBindingUnit; + pIndirectParams[i].userDataShaderUsage = Pal::ApiShaderStageVertex; + static_assert(sizeof(Pal::BindVertexDataIndirectArgs) == sizeof(VkBindVertexBufferIndirectCommandNV)); + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_MESH_TASKS_NV: + pIndirectParams[i].type = Pal::IndirectParamType::DispatchMesh; + pIndirectParams[i].sizeInBytes = sizeof(Pal::DispatchMeshIndirectArgs); + static_assert(sizeof(Pal::DispatchMeshIndirectArgs) == sizeof(VkDrawMeshTasksIndirectCommandEXT)); + break; + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV: + { + const PipelineLayout* pPipelineLayout = PipelineLayout::ObjectFromHandle(token.pushconstantPipelineLayout); + const UserDataLayout& userDataLayout = pPipelineLayout->GetInfo().userDataLayout; + + if (userDataLayout.scheme == PipelineLayoutScheme::Indirect) + { + VK_NOT_IMPLEMENTED; + } + + uint32_t startInDwords = token.pushconstantOffset / sizeof(uint32_t); + uint32_t lengthInDwords = PipelineLayout::GetPushConstantSizeInDword(token.pushconstantSize); + + pIndirectParams[i].type = Pal::IndirectParamType::SetUserData; + pIndirectParams[i].userData.entryCount = lengthInDwords; + pIndirectParams[i].sizeInBytes = sizeof(uint32_t) * lengthInDwords; + 
pIndirectParams[i].userData.firstEntry = userDataLayout.compact.pushConstRegBase + startInDwords; + pIndirectParams[i].userDataShaderUsage = VkToPalShaderStageMask(token.pushconstantShaderStageFlags); + break; + } + + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_SHADER_GROUP_NV: + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_STATE_FLAGS_NV: + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_TASKS_NV: + case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PIPELINE_NV: + VK_NOT_IMPLEMENTED; + break; + + default: + VK_NEVER_CALLED(); + break; + + } + + if (i < (pCreateInfo->tokenCount - 1)) + { + bindingArgsSize += pIndirectParams[i].sizeInBytes; + } + } + + for (uint32_t i = 0; i < pCreateInfo->streamCount; ++i) + { + uint32_t stride = pCreateInfo->pStreamStrides[i]; + pPalCreateInfo->strideInBytes += stride; + } + + pPalCreateInfo->paramCount = pCreateInfo->tokenCount; + + // Override userDataShaderUsage to compute shader only for dispatch type + if (isDispatch) + { + for (uint32_t i = 0; i < pPalCreateInfo->paramCount; ++i) + { + pIndirectParams[i].userDataShaderUsage = Pal::ShaderStageFlagBits::ApiShaderStageCompute; + } + } +} + +// ===================================================================================================================== +void IndirectCommandsLayout::CalculateMemoryRequirements( + const Device* pDevice, + VkMemoryRequirements2* pMemoryRequirements + ) const +{ + VK_ASSERT(m_perGpu[DefaultDeviceIndex].pGenerator != nullptr); + Pal::GpuMemoryRequirements memReqs = {}; + m_perGpu[DefaultDeviceIndex].pGenerator->GetGpuMemoryRequirements(&memReqs); + +#if DEBUG + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) + { + VK_ASSERT(m_perGpu[deviceIdx].pGenerator != nullptr); + + if (deviceIdx != DefaultDeviceIndex) + { + Pal::GpuMemoryRequirements deviceReqs = {}; + m_perGpu[deviceIdx].pGenerator->GetGpuMemoryRequirements(&deviceReqs); + VK_ASSERT(memcmp(&memReqs, &deviceReqs, sizeof(deviceReqs)) == 0); + } + } +#endif + + pMemoryRequirements->memoryRequirements.alignment = memReqs.alignment; + pMemoryRequirements->memoryRequirements.size = memReqs.size; + + pMemoryRequirements->memoryRequirements.memoryTypeBits = 0; + + for (uint32_t i = 0; i < memReqs.heapCount; ++i) + { + uint32_t typeIndexBits; + + if (pDevice->GetVkTypeIndexBitsFromPalHeap(memReqs.heaps[i], &typeIndexBits)) + { + pMemoryRequirements->memoryRequirements.memoryTypeBits |= typeIndexBits; + } + } +} + +// ===================================================================================================================== +void IndirectCommandsLayout::BindPreprocessBuffer( + VkBuffer buffer, + VkDeviceSize memOffset, + uint32_t deviceIdx) +{ + Buffer* pBuffer = Buffer::ObjectFromHandle(buffer); + Pal::gpusize bufferVirtAddr = pBuffer->PalMemory(deviceIdx)->Desc().gpuVirtAddr + memOffset; + + if (m_perGpu[deviceIdx].preprocessBufferVirtAddr != bufferVirtAddr) + { + Pal::Result palResult = m_perGpu[deviceIdx].pGenerator->BindGpuMemory(pBuffer->PalMemory(deviceIdx), + memOffset); + VK_ASSERT(palResult == Pal::Result::Success); + m_perGpu[deviceIdx].preprocessBufferVirtAddr = bufferVirtAddr; + } +} + +// ===================================================================================================================== +VkResult IndirectCommandsLayout::Destroy( + Device* pDevice, + const VkAllocationCallbacks* pAllocator) +{ + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) + { + if (m_perGpu[deviceIdx].pGenerator != nullptr) + { + m_perGpu[deviceIdx].pGenerator->Destroy(); + } + // 
It is the app's responsibility to free the preprocess buffer. + m_perGpu[deviceIdx].preprocessBufferVirtAddr = 0; + } + + Util::Destructor(this); + + pDevice->FreeApiObject(pAllocator, this); + + return VK_SUCCESS; +} + +} // namespace vk diff --git a/icd/api/vk_instance.cpp b/icd/api/vk_instance.cpp index cca1b3cc..98b14635 100644 --- a/icd/api/vk_instance.cpp +++ b/icd/api/vk_instance.cpp @@ -46,6 +46,8 @@ #if ICD_GPUOPEN_DEVMODE_BUILD #include "devmode/devmode_mgr.h" +#include "devmode/devmode_rgp.h" +#include "devmode/devmode_ubertrace.h" #endif #include "res/ver.h" @@ -90,7 +92,7 @@ Instance::Instance( m_preInitAppProfile(preInitProfile), m_screenCount(0), m_pScreenStorage(nullptr), - m_pDevModeMgr(nullptr), + m_pDevMode(nullptr), m_debugReportCallbacks(&m_palAllocator), m_debugUtilsMessengers(&m_palAllocator), m_logTagIdMask(0), @@ -439,7 +441,7 @@ VkResult Instance::Init( // Late-initialize the developer mode manager. Needs to be called after settings are committed but BEFORE // physical devices are late-initialized (below). - if ((status == VK_SUCCESS) && (m_pDevModeMgr != nullptr)) + if ((status == VK_SUCCESS) && (m_pDevMode != nullptr)) { DevModeLateInitialize(); } @@ -549,12 +551,10 @@ VkResult Instance::Init( InitDispatchTable(); -#if DEBUG // Optionally wait for a debugger to be attached utils::WaitIdleForDebugger(pPhysicalDevice->GetRuntimeSettings().waitForDebugger, &pPhysicalDevice->GetRuntimeSettings().waitForDebuggerExecutableName[0], pPhysicalDevice->GetRuntimeSettings().debugTimeout); -#endif } return status; @@ -630,9 +630,9 @@ VkResult Instance::LoadAndCommitSettings( #if ICD_GPUOPEN_DEVMODE_BUILD // Inform developer mode manager of settings. This also finalizes the developer mode manager. - if (m_pDevModeMgr != nullptr) + if (m_pDevMode != nullptr) { - m_pDevModeMgr->Finalize(deviceCount, settingsLoaders); + m_pDevMode->Finalize(deviceCount, settingsLoaders); } #endif @@ -671,8 +671,8 @@ VkResult Instance::Destroy(void) AmdvlkLog(m_logTagIdMask, GeneralPrint, "%s End ********\n", GetApplicationName()); #if ICD_GPUOPEN_DEVMODE_BUILD - // Pipeline binary cache is required to be freed before destroying DevModeMgr - // because DevModeMgr manages the state of pipeline binary cache. + // The pipeline binary cache must be freed before DevMode is destroyed + // because DevMode manages the state of the pipeline binary cache.
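Stepping back to IndirectCommandsLayout::Destroy above: the preprocess buffer named there follows the usual VK_NV_device_generated_commands contract, in which the application owns that memory. A rough app-side sketch, with 'device', 'pipeline', 'layout' and 'maxSequences' as placeholders:

    // Query how large the preprocess buffer must be for this pipeline/layout pair.
    VkGeneratedCommandsMemoryRequirementsInfoNV reqInfo = {};
    reqInfo.sType                  = VK_STRUCTURE_TYPE_GENERATED_COMMANDS_MEMORY_REQUIREMENTS_INFO_NV;
    reqInfo.pipelineBindPoint      = VK_PIPELINE_BIND_POINT_GRAPHICS;
    reqInfo.pipeline               = pipeline;
    reqInfo.indirectCommandsLayout = layout;
    reqInfo.maxSequencesCount      = maxSequences;

    VkMemoryRequirements2 memReqs = {};
    memReqs.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2;
    vkGetGeneratedCommandsMemoryRequirementsNV(device, &reqInfo, &memReqs);

    // The application then allocates a buffer of memReqs.memoryRequirements.size, passes it
    // as VkGeneratedCommandsInfoNV::preprocessBuffer, and frees it itself after teardown.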
uint32_t deviceCount = PhysicalDeviceManager::MaxPhysicalDevices; VkPhysicalDevice devices[PhysicalDeviceManager::MaxPhysicalDevices] = {}; m_pPhysicalDeviceManager->EnumeratePhysicalDevices(&deviceCount, devices); @@ -681,9 +681,9 @@ VkResult Instance::Destroy(void) ApiPhysicalDevice::ObjectFromHandle(devices[deviceIdx])->GetCompiler()->DestroyPipelineBinaryCache(); } - if (m_pDevModeMgr != nullptr) + if (m_pDevMode != nullptr) { - m_pDevModeMgr->Destroy(); + m_pDevMode->Destroy(); } #endif @@ -1049,12 +1049,24 @@ void Instance::DevModeEarlyInitialize() { #if ICD_GPUOPEN_DEVMODE_BUILD VK_ASSERT(m_pPhysicalDeviceManager == nullptr); - VK_ASSERT(m_pDevModeMgr == nullptr); + VK_ASSERT(m_pDevMode == nullptr); // Initialize the devmode manager which abstracts interaction with the gpuopen dev driver component if (m_pPalPlatform->GetDevDriverServer() != nullptr) { - const VkResult result = DevModeMgr::Create(this, &m_pDevModeMgr); + VkResult result; + +#if PAL_BUILD_RDF + if ((m_pPalPlatform->GetTraceSession() != nullptr) && + (m_pPalPlatform->GetTraceSession()->IsTracingEnabled())) + { + result = DevModeUberTrace::Create(this, reinterpret_cast<DevModeUberTrace**>(&m_pDevMode)); + } + else +#endif + { + result = DevModeRgp::Create(this, reinterpret_cast<DevModeRgp**>(&m_pDevMode)); + } VK_ASSERT(result == VK_SUCCESS); } @@ -1068,16 +1080,16 @@ void Instance::DevModeLateInitialize() { #if ICD_GPUOPEN_DEVMODE_BUILD VK_ASSERT(m_pPhysicalDeviceManager != nullptr); - VK_ASSERT(m_pDevModeMgr != nullptr); + VK_ASSERT(m_pDevMode != nullptr); // Query if we need support for SQTT tracing, and notify the instance so that the correct dispatch table // layer can be installed. - if (m_pDevModeMgr->IsTracingEnabled()) + if (m_pDevMode->IsTracingEnabled()) { EnableTracingSupport(); } - if (m_pDevModeMgr->IsCrashAnalysisEnabled()) + if (m_pDevMode->IsCrashAnalysisEnabled()) { EnableCrashAnalysisSupport(); } diff --git a/icd/api/vk_physical_device.cpp b/icd/api/vk_physical_device.cpp index 9f738431..75fdd9cf 100644 --- a/icd/api/vk_physical_device.cpp +++ b/icd/api/vk_physical_device.cpp @@ -42,6 +42,7 @@ #include "include/vk_utils.h" #include "include/vk_conv.h" #include "include/vk_surface.h" +#include "include/vk_indirect_commands_layout.h" #include "include/khronos/vk_icd.h" @@ -2079,6 +2080,27 @@ VkResult PhysicalDevice::GetImageFormatProperties( : formatProperties.linearTilingFeatures; } + // If extended usage was set, compute the extended feature set based on the type + // of image and the format compatibility classes. + if (Util::TestAnyFlagSet(flags, VK_IMAGE_CREATE_EXTENDED_USAGE_BIT)) + { + if (Formats::IsYuvFormat(format) && (Formats::GetYuvPlaneCounts(format) > 1)) + { + VkFormat singlePlaneFormat = VK_FORMAT_UNDEFINED; + + for (uint32_t i = 0; i < Formats::GetYuvPlaneCounts(format); ++i) + { + singlePlaneFormat = Formats::GetCompatibleSinglePlaneFormat(format, i); + + supportedFeatures |= Formats::GetExtendedFeatureFlags(this, singlePlaneFormat, tiling, settings); + } + } + else + { + supportedFeatures |= Formats::GetExtendedFeatureFlags(this, format, tiling, settings); + } + } + // 3D textures with depth or stencil format are not supported if ((type == VK_IMAGE_TYPE_3D) && (Formats::HasDepth(format) || Formats::HasStencil(format))) { @@ -2116,14 +2138,7 @@ VkResult PhysicalDevice::GetImageFormatProperties( (((usage & VK_IMAGE_USAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR) != 0) && ((supportedFeatures & VK_FORMAT_FEATURE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR) == 0))) { - // If extended usage was set ignore the error.
We do not know what format or usage is intended. - // However for Yuv and Depth images that do not have any compatible formats, report error always. - if (((flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT) == 0 )|| - Formats::IsYuvFormat(format) || - Formats::IsDepthStencilFormat(format)) - { - return VK_ERROR_FORMAT_NOT_SUPPORTED; - } + return VK_ERROR_FORMAT_NOT_SUPPORTED; } // Calculate maxResourceSize @@ -2582,11 +2597,11 @@ VkResult PhysicalDevice::GetPhysicalDeviceToolPropertiesEXT( bool isProfilingEnabled = false; VkResult result = VK_SUCCESS; - DevModeMgr* devModeMgr = VkInstance()->GetDevModeMgr(); + IDevMode* devMode = VkInstance()->GetDevModeMgr(); - if (devModeMgr != nullptr) + if (devMode != nullptr) { - isProfilingEnabled = devModeMgr->IsTracingEnabled(); + isProfilingEnabled = devMode->IsTracingEnabled(); } if (pToolProperties == nullptr) @@ -4382,6 +4397,12 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( availableExtensions.AddExtension(VK_DEVICE_EXTENSION(NV_COMPUTE_SHADER_DERIVATIVES)); } + if ((pPhysicalDevice == nullptr) || (pPhysicalDevice->GetRuntimeSettings().exportNvDeviceGeneratedCommands)) + { + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(NV_DEVICE_GENERATED_COMMANDS)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(NV_DEVICE_GENERATED_COMMANDS_COMPUTE)); + } + if ((pPhysicalDevice == nullptr) || (pPhysicalDevice->GetRuntimeSettings().enableGraphicsPipelineLibraries)) { availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_GRAPHICS_PIPELINE_LIBRARY)); @@ -4415,6 +4436,8 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_NON_SEAMLESS_CUBE_MAP)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_SHADER_MODULE_IDENTIFIER)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_MAXIMAL_RECONVERGENCE)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_EXTENDED_DYNAMIC_STATE3)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_VERTEX_INPUT_DYNAMIC_STATE)); @@ -4423,9 +4446,13 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_SUBGROUP_ROTATE)); - availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_FLOAT_CONTROLS2)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_FLOAT_CONTROLS2)); + + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_QUAD_CONTROL)); + + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_NESTED_COMMAND_BUFFER)); - availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_QUAD_CONTROL)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_DYNAMIC_RENDERING_LOCAL_READ)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_VERTEX_ATTRIBUTE_DIVISOR)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_INDEX_TYPE_UINT8)); @@ -5086,19 +5113,20 @@ void PhysicalDevice::GetPhysicalDeviceDotProduct8Properties( *pIntegerDotProduct8BitUnsignedAccelerated = int8DotSupport; *pIntegerDotProduct8BitSignedAccelerated = int8DotSupport; - *pIntegerDotProductAccumulatingSaturating8BitUnsignedAccelerated = VK_FALSE; - *pIntegerDotProductAccumulatingSaturating8BitSignedAccelerated = VK_FALSE; - *pIntegerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = VK_FALSE; + *pIntegerDotProductAccumulatingSaturating8BitUnsignedAccelerated = int8DotSupport; + *pIntegerDotProductAccumulatingSaturating8BitSignedAccelerated = int8DotSupport; #if VKI_BUILD_GFX11 if 
(PalProperties().gfxLevel >= Pal::GfxIpLevel::GfxIp11_0) { *pIntegerDotProduct8BitMixedSignednessAccelerated = VK_TRUE; + *pIntegerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = VK_TRUE; } else #endif { *pIntegerDotProduct8BitMixedSignednessAccelerated = VK_FALSE; + *pIntegerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = VK_FALSE; } } @@ -5117,19 +5145,20 @@ void PhysicalDevice::GetPhysicalDeviceDotProduct4x8Properties( *pIntegerDotProduct4x8BitPackedUnsignedAccelerated = int8DotSupport; *pIntegerDotProduct4x8BitPackedSignedAccelerated = int8DotSupport; - *pIntegerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = VK_FALSE; - *pIntegerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = VK_FALSE; - *pIntegerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = VK_FALSE; + *pIntegerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = int8DotSupport; + *pIntegerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = int8DotSupport; #if VKI_BUILD_GFX11 if (PalProperties().gfxLevel >= Pal::GfxIpLevel::GfxIp11_0) { *pIntegerDotProduct4x8BitPackedMixedSignednessAccelerated = VK_TRUE; + *pIntegerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = VK_TRUE; } else #endif { *pIntegerDotProduct4x8BitPackedMixedSignednessAccelerated = VK_FALSE; + *pIntegerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = VK_FALSE; } } @@ -7346,6 +7375,46 @@ size_t PhysicalDevice::GetFeatures2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEVICE_GENERATED_COMMANDS_FEATURES_NV: + { + auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceDeviceGeneratedCommandsFeaturesNV*>(pHeader); + if (updateFeatures) + { + pExtInfo->deviceGeneratedCommands = VK_TRUE; + } + + structSize = sizeof(*pExtInfo); + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEVICE_GENERATED_COMMANDS_COMPUTE_FEATURES_NV: + { + auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceDeviceGeneratedCommandsComputeFeaturesNV*>(pHeader); + if (updateFeatures) + { + pExtInfo->deviceGeneratedCompute = VK_TRUE; + pExtInfo->deviceGeneratedComputePipelines = VK_FALSE; + pExtInfo->deviceGeneratedComputeCaptureReplay = VK_FALSE; + } + + structSize = sizeof(*pExtInfo); + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_NESTED_COMMAND_BUFFER_FEATURES_EXT: + { + auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceNestedCommandBufferFeaturesEXT*>(pHeader); + if (updateFeatures) + { + pExtInfo->nestedCommandBuffer = VK_TRUE; + pExtInfo->nestedCommandBufferRendering = VK_TRUE; + pExtInfo->nestedCommandBufferSimultaneousUse = VK_FALSE; + } + + structSize = sizeof(*pExtInfo); + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAME_BOUNDARY_FEATURES_EXT: { auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceFrameBoundaryFeaturesEXT*>(pHeader); @@ -8331,6 +8400,28 @@ void PhysicalDevice::GetDeviceProperties2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEVICE_GENERATED_COMMANDS_PROPERTIES_NV: + { + auto* pProps = static_cast<VkPhysicalDeviceDeviceGeneratedCommandsPropertiesNV*>(pNext); + pProps->maxIndirectCommandsStreamCount = 1; + pProps->maxIndirectCommandsStreamStride = UINT32_MAX; + pProps->maxIndirectCommandsTokenCount = MaxIndirectTokenCount; + pProps->maxIndirectCommandsTokenOffset = MaxIndirectTokenOffset; + pProps->minIndirectCommandsBufferOffsetAlignment = 4; + pProps->minSequencesCountBufferOffsetAlignment = 4; + pProps->minSequencesIndexBufferOffsetAlignment = 4; + pProps->maxGraphicsShaderGroupCount = 0; + pProps->maxIndirectSequenceCount = UINT32_MAX >> 1; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_NESTED_COMMAND_BUFFER_PROPERTIES_EXT: + { + auto* pProps = static_cast<VkPhysicalDeviceNestedCommandBufferPropertiesEXT*>(pNext); + pProps->maxCommandBufferNestingLevel = 1; + break; + } + case
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_KHR: { auto* pProps = static_cast<VkPhysicalDeviceVertexAttributeDivisorPropertiesKHR*>(pNext); diff --git a/icd/api/vk_physical_device_manager.cpp b/icd/api/vk_physical_device_manager.cpp index 8e16d303..eb308126 100644 --- a/icd/api/vk_physical_device_manager.cpp +++ b/icd/api/vk_physical_device_manager.cpp @@ -39,6 +39,7 @@ #include "palVectorImpl.h" #include #include +#include "settings/experimentsLoader.h" namespace vk { @@ -51,7 +52,8 @@ PhysicalDeviceManager::PhysicalDeviceManager( m_pInstance(pInstance), m_pDisplayManager(pDisplayManager), m_devices(pInstance->Allocator()), - m_pAllNullProperties(nullptr) + m_pAllNullProperties(nullptr), + m_pExperimentsLoader(nullptr) { } @@ -107,7 +109,27 @@ VkResult PhysicalDeviceManager::Create( // ===================================================================================================================== VkResult PhysicalDeviceManager::Initialize() { - return UpdateLockedPhysicalDeviceList(); + VkResult result = VK_SUCCESS; + + void* pExpLoader = m_pInstance->AllocMem(sizeof(ExperimentsLoader), VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + + if (pExpLoader != nullptr) + { + m_pExperimentsLoader = VK_PLACEMENT_NEW(pExpLoader) ExperimentsLoader(m_pInstance->PalPlatform()); + + result = PalToVkResult(m_pExperimentsLoader->Init()); + } + else + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + + if (result == VK_SUCCESS) + { + result = UpdateLockedPhysicalDeviceList(); + } + + return result; } // ===================================================================================================================== @@ -118,6 +140,12 @@ PhysicalDeviceManager::~PhysicalDeviceManager() m_pInstance->FreeMem(m_pAllNullProperties); } + if (m_pExperimentsLoader != nullptr) + { + m_pExperimentsLoader->~ExperimentsLoader(); + m_pInstance->FreeMem(m_pExperimentsLoader); + } + DestroyLockedPhysicalDeviceList(); } @@ -274,7 +302,8 @@ VkResult PhysicalDeviceManager::UpdateLockedPhysicalDeviceList(void) if (pLoader != nullptr) { settingsArray[i] = VK_PLACEMENT_NEW(pLoader) VulkanSettingsLoader(pPalDeviceList[i], - m_pInstance->PalPlatform()); + m_pInstance->PalPlatform(), + m_pExperimentsLoader); } else { diff --git a/icd/api/vk_pipeline.cpp b/icd/api/vk_pipeline.cpp index 862380fe..0a793d0b 100644 --- a/icd/api/vk_pipeline.cpp +++ b/icd/api/vk_pipeline.cpp @@ -280,11 +280,11 @@ VkResult Pipeline::BuildShaderStageInfo( uint32_t maxOutIdx = 0; - for (uint32_t i = 0; i < stageCount; ++i) + for (uint32_t stageIdx = 0; stageIdx < stageCount; ++stageIdx) { - const VkPipelineShaderStageCreateInfo& stageInfo = pStages[i]; + const VkPipelineShaderStageCreateInfo& stageInfo = pStages[stageIdx]; const ShaderStage stage = ShaderFlagBitToStage(stageInfo.stage); - const uint32_t outIdx = pfnGetOutputIdx(i, stage); + const uint32_t outIdx = pfnGetOutputIdx(stageIdx, stage); maxOutIdx = Util::Max(maxOutIdx, outIdx + 1); @@ -335,28 +335,30 @@ VkResult Pipeline::BuildShaderStageInfo( if (pShaderModuleCreateInfo != nullptr) { - flags = pShaderModuleCreateInfo->flags; + flags = pShaderModuleCreateInfo->flags; shaderBinary.codeSize = pShaderModuleCreateInfo->codeSize; shaderBinary.pCode = pShaderModuleCreateInfo->pCode; codeHash = ShaderModule::BuildCodeHash( shaderBinary.pCode, shaderBinary.codeSize); - } - if (shaderBinary.pCode != nullptr) - { - result = pCompiler->BuildShaderModule( - pDevice, - flags, - VK_INTERNAL_SHADER_FLAGS_FORCE_UNCACHED_BIT, - shaderBinary, - &pTempModules[outIdx]); - - pShaderStageInfo[outIdx].pModuleHandle = &pTempModules[outIdx]; -
pShaderStageInfo[outIdx].codeSize = shaderBinary.codeSize; + if (shaderBinary.pCode != nullptr) + { + result = pCompiler->BuildShaderModule( + pDevice, + flags, + VK_INTERNAL_SHADER_FLAGS_FORCE_UNCACHED_BIT, + shaderBinary, + &pTempModules[outIdx]); + + pTempModules[outIdx].codeHash = codeHash; + pShaderStageInfo[outIdx].pModuleHandle = &pTempModules[outIdx]; + pShaderStageInfo[outIdx].codeSize = shaderBinary.codeSize; + } } - else if (pPipelineShaderStageModuleIdentifierCreateInfoEXT != nullptr) + + if (pPipelineShaderStageModuleIdentifierCreateInfoEXT != nullptr) { // Get the 128 bit ShaderModule Hash VK_ASSERT(pPipelineShaderStageModuleIdentifierCreateInfoEXT->identifierSize == @@ -377,11 +379,7 @@ VkResult Pipeline::BuildShaderStageInfo( break; } - pShaderStageInfo[outIdx].codeHash = ShaderModule::GetCodeHash(codeHash, stageInfo.pName); - if (pShaderStageInfo[outIdx].pModuleHandle == &pTempModules[outIdx]) - { - pTempModules[outIdx].codeHash = pShaderStageInfo[outIdx].codeHash; - } + pShaderStageInfo[outIdx].codeHash = ShaderModule::GetCodeHash(codeHash, stageInfo.pName); } pShaderStageInfo[outIdx].stage = stage; @@ -446,9 +444,9 @@ void Pipeline::HandleExtensionStructs( // ===================================================================================================================== Pipeline::Pipeline( - Device* const pDevice, + Device* const pDevice, #if VKI_RAY_TRACING - bool hasRayTracing, + bool hasRayTracing, #endif VkPipelineBindPoint type) : @@ -1412,6 +1410,24 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetPipelineExecutableInternalRepresentationsKHR return (*pInternalRepresentationCount < numberOfInternalRepresentations) ? VK_INCOMPLETE : VK_SUCCESS; } +// ===================================================================================================================== +VKAPI_ATTR VkDeviceAddress VKAPI_CALL vkGetPipelineIndirectDeviceAddressNV( + VkDevice device, + const VkPipelineIndirectDeviceAddressInfoNV* pInfo) +{ + VK_NOT_IMPLEMENTED; + return 0; +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkGetPipelineIndirectMemoryRequirementsNV( + VkDevice device, + const VkComputePipelineCreateInfo* pCreateInfo, + VkMemoryRequirements2* pMemoryRequirements) +{ + VK_NOT_IMPLEMENTED; +} + } // namespace entry } // namespace vk diff --git a/icd/api/vk_queue.cpp b/icd/api/vk_queue.cpp index 2925f676..71d28ffa 100644 --- a/icd/api/vk_queue.cpp +++ b/icd/api/vk_queue.cpp @@ -54,6 +54,9 @@ #include "sqtt/sqtt_layer.h" #include "palQueue.h" +#include "palVectorImpl.h" +#include "palListImpl.h" + namespace vk { @@ -79,10 +82,14 @@ Queue::Queue( m_queueFamilyIndex(queueFamilyIndex), m_queueIndex(queueIndex), m_queueFlags(queueFlags), - m_pDevModeMgr(pDevice->VkInstance()->GetDevModeMgr()), + m_pDevMode(pDevice->VkInstance()->GetDevModeMgr()), m_pStackAllocator(pStackAllocator), m_pCmdBufferRing(pCmdBufferRing), m_isDeviceIndependent(isDeviceIndependent) +#if VKI_RAY_TRACING + , m_pCpsGlobalMem(nullptr) + , m_cpsMemDestroyList(pDevice->VkInstance()->Allocator()) +#endif { if (ppPalQueues != nullptr) { @@ -881,6 +888,16 @@ Queue::~Queue() } } +#if VKI_RAY_TRACING + FreeRetiredCpsStackMem(); + VK_ASSERT(m_cpsMemDestroyList.NumElements() == 0); + + if (m_pCpsGlobalMem != nullptr) + { + m_pDevice->MemMgr()->FreeGpuMem(m_pCpsGlobalMem); + m_pCpsGlobalMem = nullptr; + } +#endif } // 
===================================================================================================================== @@ -1073,9 +1090,9 @@ VkResult Queue::Submit( VkFence fence) { #if ICD_GPUOPEN_DEVMODE_BUILD - DevModeMgr* pDevModeMgr = m_pDevice->VkInstance()->GetDevModeMgr(); + IDevMode* pDevMode = m_pDevice->VkInstance()->GetDevModeMgr(); - bool timedQueueEvents = ((pDevModeMgr != nullptr) && pDevModeMgr->IsQueueTimingActive(m_pDevice)); + bool timedQueueEvents = ((pDevMode != nullptr) && pDevMode->IsQueueTimingActive(m_pDevice)); #else bool timedQueueEvents = false; #endif @@ -1087,6 +1104,10 @@ VkResult Queue::Submit( const bool isSynchronization2 = std::is_same<SubmitInfoType, VkSubmitInfo2>::value; +#if VKI_RAY_TRACING + FreeRetiredCpsStackMem(); +#endif + // The fence should be only used in the last submission to PAL. The implicit ordering guarantees provided by PAL // make sure that the fence is only signaled when all submissions complete. if ((submitCount == 0) && (pFence != nullptr)) { @@ -1168,7 +1189,7 @@ case VK_STRUCTURE_TYPE_FRAME_BOUNDARY_EXT: // Note: VK_EXT_frame_boundary is only intended for tools/debuggers // to be able to associate frame information with queue submissions. - DevModeFrameBoundary(pDevModeMgr, static_cast<const VkFrameBoundaryEXT*>(pNext)); + DevModeFrameBoundary(pDevMode, static_cast<const VkFrameBoundaryEXT*>(pNext)); break; default: @@ -1296,6 +1317,25 @@ ApiCmdBuffer* const * pCommandBuffers = reinterpret_cast<ApiCmdBuffer* const*>(pCmdBuffers); +#if VKI_RAY_TRACING + uint64 maxCpsStackSize = 0; + Pal::IFence* pCpsMemFence = nullptr; + + for (uint32_t i = 0; i < cmdBufferCount; ++i) + { + const CmdBuffer& cmdBuf = *(*pCommandBuffers[i]); + + if (cmdBuf.GetCpsMemSize() > 0) + { + maxCpsStackSize = Util::Max(maxCpsStackSize, cmdBuf.GetCpsMemSize()); + } + } + + if (maxCpsStackSize > 0) + { + pCpsMemFence = GetCpsStackMem(deviceIdx, maxCpsStackSize); + } +#endif perSubQueueInfo.cmdBufferCount = 0; palSubmitInfo.stackSizeInDwords = 0; @@ -1320,6 +1360,11 @@ { pCmdBufInfos[i].isValid = true; pCmdBufInfos[i].rayTracingExecuted = true; + + if (m_pCpsGlobalMem != nullptr) + { + cmdBuf.ApplyPatchCpsRequests(deviceIdx, *m_pCpsGlobalMem->PalMemory(deviceIdx)); + } } #endif @@ -1402,11 +1447,23 @@ } } + Pal::IFence* iFence[2] = {nullptr, nullptr}; + palSubmitInfo.ppFences = iFence; + +#if VKI_RAY_TRACING + if (pCpsMemFence != nullptr) + { + iFence[0] = pCpsMemFence; + palSubmitInfo.fenceCount = 1; + } +#endif + if (lastBatch && (pFence != nullptr)) { pPalFence = pFence->PalFence(deviceIdx); - palSubmitInfo.ppFences = &pPalFence; - palSubmitInfo.fenceCount = 1; + + iFence[palSubmitInfo.fenceCount] = pPalFence; + palSubmitInfo.fenceCount++; pFence->SetActiveDevice(deviceIdx); } @@ -1483,7 +1540,9 @@ } else { - palResult = PalQueueSubmit(m_pDevice, PalQueue(deviceIdx), palSubmitInfo); + { + palResult = PalQueueSubmit(m_pDevice, PalQueue(deviceIdx), palSubmitInfo); + } } } else { @@ -1492,7 +1551,7 @@ // TMZ is NOT supported for GPUOPEN path.
VK_ASSERT((*pCommandBuffers[0])->IsProtected() == false); - palResult = m_pDevModeMgr->TimedQueueSubmit( + palResult = m_pDevMode->TimedQueueSubmit( deviceIdx, this, cmdBufferCount, @@ -1609,10 +1668,10 @@ VkResult Queue::PalSignalSemaphores( const uint32_t* pSemaphoreDeviceIndices) { #if ICD_GPUOPEN_DEVMODE_BUILD - DevModeMgr* pDevModeMgr = m_pDevice->VkInstance()->GetDevModeMgr(); + IDevMode* pDevMode = m_pDevice->VkInstance()->GetDevModeMgr(); - bool timedQueueEvents = ((pDevModeMgr != nullptr) && - pDevModeMgr->IsQueueTimingActive(m_pDevice)); + bool timedQueueEvents = ((pDevMode != nullptr) && + pDevMode->IsQueueTimingActive(m_pDevice)); #else bool timedQueueEvents = false; #endif @@ -1657,7 +1716,7 @@ VkResult Queue::PalSignalSemaphores( else { #if ICD_GPUOPEN_DEVMODE_BUILD - palResult = pDevModeMgr->TimedSignalQueueSemaphore(deviceIdx, this, pSemaphores[i], pointValue, + palResult = pDevMode->TimedSignalQueueSemaphore(deviceIdx, this, pSemaphores[i], pointValue, pPalSemaphore); #else VK_NEVER_CALLED(); @@ -1686,10 +1745,10 @@ VkResult Queue::PalWaitSemaphores( uint32_t deviceIdx = DefaultDeviceIndex; #if ICD_GPUOPEN_DEVMODE_BUILD - DevModeMgr* pDevModeMgr = m_pDevice->VkInstance()->GetDevModeMgr(); + IDevMode* pDevMode = m_pDevice->VkInstance()->GetDevModeMgr(); - bool timedQueueEvents = ((pDevModeMgr != nullptr) && - pDevModeMgr->IsQueueTimingActive(m_pDevice)); + bool timedQueueEvents = ((pDevMode != nullptr) && + pDevMode->IsQueueTimingActive(m_pDevice)); #else bool timedQueueEvents = false; #endif @@ -1736,7 +1795,7 @@ VkResult Queue::PalWaitSemaphores( else { #if ICD_GPUOPEN_DEVMODE_BUILD - palResult = pDevModeMgr->TimedWaitQueueSemaphore(deviceIdx, this, pSemaphores[i], pointValue, + palResult = pDevMode->TimedWaitQueueSemaphore(deviceIdx, this, pSemaphores[i], pointValue, pPalSemaphore); #else VK_NEVER_CALLED(); @@ -1759,7 +1818,7 @@ VkResult Queue::UpdateFlipStatus( { bool isOwner = false; Pal::IDevice* pPalDevice = m_pDevice->PalDevice(DefaultDeviceIndex); - uint32_t vidPnSourceId = pSwapChain->GetFullscreenMgr()->GetVidPnSourceId(); + uint32_t vidPnSourceId = pSwapChain->GetVidPnSourceId(); Pal::Result palResult = pPalDevice->GetFlipStatus(vidPnSourceId, &m_flipStatus.flipFlags, &isOwner); if (palResult == Pal::Result::Success) @@ -1805,9 +1864,9 @@ VkResult Queue::Present( if (m_pDevice->VkInstance()->GetDevModeMgr() != nullptr) { m_pDevice->VkInstance()->GetDevModeMgr()->NotifyFrameEnd(this, - DevModeMgr::FrameDelimiterType::QueuePresent); + IDevMode::FrameDelimiterType::QueuePresent); m_pDevice->VkInstance()->GetDevModeMgr()->NotifyFrameBegin(this, - DevModeMgr::FrameDelimiterType::QueuePresent); + IDevMode::FrameDelimiterType::QueuePresent); } #endif return VK_ERROR_INITIALIZATION_FAILED; @@ -1943,7 +2002,7 @@ VkResult Queue::Present( if (pSwapChain->GetFullscreenMgr() != nullptr) { Pal::Result palResult = m_pDevice->PalDevice(DefaultDeviceIndex)->PollFullScreenFrameMetadataControl( - pSwapChain->GetFullscreenMgr()->GetVidPnSourceId(), + pSwapChain->GetVidPnSourceId(), &m_palFrameMetadataControl); VK_ASSERT(palResult == Pal::Result::Success); @@ -1973,7 +2032,7 @@ VkResult Queue::Present( if (m_pDevice->VkInstance()->GetDevModeMgr() != nullptr) { m_pDevice->VkInstance()->GetDevModeMgr()->NotifyFrameEnd(this, - DevModeMgr::FrameDelimiterType::QueuePresent); + IDevMode::FrameDelimiterType::QueuePresent); } #endif @@ -2017,7 +2076,7 @@ VkResult Queue::Present( if (m_pDevice->VkInstance()->GetDevModeMgr() != nullptr) { 
m_pDevice->VkInstance()->GetDevModeMgr()->NotifyFrameBegin(this, - DevModeMgr::FrameDelimiterType::QueuePresent); + IDevMode::FrameDelimiterType::QueuePresent); } #endif @@ -2603,9 +2662,7 @@ bool Queue::BuildPostProcessCommands( frameInfo.pSrcImage = pPresentInfo->pSrcImage; frameInfo.debugOverlay.presentMode = pPresentInfo->presentMode; frameInfo.debugOverlay.wsiPlatform = displayableInfo.palPlatform; - frameInfo.debugOverlay.presentKey = pSwapChain->IsDxgiEnabled() - ? Pal::PresentKeyFromPointer(pPresentInfo->pSwapChain) - : Pal::PresentKeyFromOsWindowHandle(displayableInfo.windowHandle); + frameInfo.debugOverlay.presentKey = Pal::PresentKeyFromOsWindowHandle(displayableInfo.windowHandle); } else { @@ -2645,42 +2702,6 @@ VkResult Queue::SubmitInternalCmdBuf( return pRing->SubmitCmdBuffer(m_pDevice, deviceIdx, m_pPalQueues[deviceIdx], cmdBufInfo, pCmdBufState); } -// ===================================================================================================================== -// Synchronize back buffer memory by doing a dummy submit with the written primary field set. -VkResult Queue::SynchronizeBackBuffer( - Memory* pMemory, - uint32_t deviceIdx) -{ - VkResult result = VK_SUCCESS; - - if (m_pDummyCmdBuffer[deviceIdx] == nullptr) - { - result = CreateDummyCmdBuffer(); - } - - if (result == VK_SUCCESS) - { - Pal::IGpuMemory* pGpuMem = pMemory->PalMemory(deviceIdx); - - Pal::PerSubQueueSubmitInfo perSubQueueInfo = {}; - - perSubQueueInfo.cmdBufferCount = 1; - perSubQueueInfo.ppCmdBuffers = &m_pDummyCmdBuffer[deviceIdx]; - perSubQueueInfo.pCmdBufInfoList = nullptr; - - Pal::SubmitInfo submitInfo = {}; - - submitInfo.pPerSubQueueInfo = &perSubQueueInfo; - submitInfo.perSubQueueInfoCount = 1; - submitInfo.blockIfFlippingCount = 1; - submitInfo.ppBlockIfFlipping = &pGpuMem; - - result = PalToVkResult(PalQueueSubmit(m_pDevice, m_pPalQueues[deviceIdx], submitInfo)); - } - - return result; -} - VkResult Queue::CreateSqttState( void* pMemory) { @@ -2701,7 +2722,7 @@ void Queue::InsertDebugUtilsLabel( #if ICD_GPUOPEN_DEVMODE_BUILD if (m_pDevice->VkInstance()->GetDevModeMgr() != nullptr) { - m_pDevice->VkInstance()->GetDevModeMgr()->NotifyFrameEnd(this, DevModeMgr::FrameDelimiterType::QueueLabel); + m_pDevice->VkInstance()->GetDevModeMgr()->NotifyFrameEnd(this, IDevMode::FrameDelimiterType::QueueLabel); if (settings.devModeBlockingEndFrameDebugUtils) { @@ -2720,7 +2741,7 @@ void Queue::InsertDebugUtilsLabel( #if ICD_GPUOPEN_DEVMODE_BUILD if (m_pDevice->VkInstance()->GetDevModeMgr() != nullptr) { - m_pDevice->VkInstance()->GetDevModeMgr()->NotifyFrameBegin(this, DevModeMgr::FrameDelimiterType::QueueLabel); + m_pDevice->VkInstance()->GetDevModeMgr()->NotifyFrameBegin(this, IDevMode::FrameDelimiterType::QueueLabel); } #endif } @@ -2730,24 +2751,138 @@ void Queue::InsertDebugUtilsLabel( // Notifies the trace tool about frame boundary, which is mainly used for unconventional vkQueuePresent-less // rendering or compute-only applications. 
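DevModeFrameBoundary() below consumes a VkFrameBoundaryEXT that the application chains into a queue submission. A minimal sketch of that app-side usage, with 'queue', 'cmdBuf' and 'frameIndex' as placeholders:

    // Mark the end of a logical frame for tools even without vkQueuePresentKHR.
    VkFrameBoundaryEXT frameBoundary = {};
    frameBoundary.sType   = VK_STRUCTURE_TYPE_FRAME_BOUNDARY_EXT;
    frameBoundary.flags   = VK_FRAME_BOUNDARY_FRAME_END_BIT_EXT;
    frameBoundary.frameID = frameIndex; // app-maintained frame counter

    VkSubmitInfo submitInfo = {};
    submitInfo.sType              = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    submitInfo.pNext              = &frameBoundary;
    submitInfo.commandBufferCount = 1;
    submitInfo.pCommandBuffers    = &cmdBuf;

    vkQueueSubmit(queue, 1, &submitInfo, VK_NULL_HANDLE);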
void Queue::DevModeFrameBoundary( - DevModeMgr* pDevModeMgr, + IDevMode* pDevMode, const VkFrameBoundaryEXT* pFrameBoundaryInfo) { #if ICD_GPUOPEN_DEVMODE_BUILD - if ((pDevModeMgr != nullptr) && + if ((pDevMode != nullptr) && (pFrameBoundaryInfo != nullptr)) { if ((pFrameBoundaryInfo->flags & VK_FRAME_BOUNDARY_FRAME_END_BIT_EXT) != 0) { - pDevModeMgr->NotifyFrameEnd(this, - DevModeMgr::FrameDelimiterType::QueuePresent); - pDevModeMgr->NotifyFrameBegin(this, - DevModeMgr::FrameDelimiterType::QueuePresent); + pDevMode->NotifyFrameEnd(this, + IDevMode::FrameDelimiterType::QueuePresent); + pDevMode->NotifyFrameBegin(this, + IDevMode::FrameDelimiterType::QueuePresent); } } #endif } + +#if VKI_RAY_TRACING +// ===================================================================================================================== +// Check the allocations in m_cpsMemDestroyList and free the retired ones. +void Queue::FreeRetiredCpsStackMem() +{ + if (m_cpsMemDestroyList.NumElements() > 0) + { + for (CpsMemDestroyListIterator iter = m_cpsMemDestroyList.Begin(); iter.Get() != nullptr; ) + { + CpsMemTracker* pTracker = iter.Get(); + if (pTracker->pFence->GetStatus() == Pal::Result::Success) + { + m_pDevice->MemMgr()->FreeGpuMem(pTracker->pMem); + pTracker->pFence->Destroy(); + m_pDevice->VkInstance()->FreeMem(pTracker->pFence); + + // Erase() implicitly advances the iterator to the next node. + m_cpsMemDestroyList.Erase(&iter); + } + else + { + break; + } + } + } +} + +// ===================================================================================================================== +// Get CPS global memory. +// - Allocate it if it does not exist. +// - Reallocate m_pCpsGlobalMem from X to Y if its size is not big enough. X is put into m_cpsMemDestroyList to be freed +// later. A fence is generated and passed in the submission to Pal. When it is signaled, X is freed. Note it is +// signaled when the first cmdbuf switching to Y is done, so this is not optimal regarding memory footprint. Ideally it +// could be signaled when X is retired, but that means every submission referencing X would have to signal an extra +// IFence even if m_pCpsGlobalMem stays unchanged. The reason is we don't know if the next submission will require a +// bigger cps stack memory.
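The retire-list scheme described in the comment above is a general fence-guarded deferred-free pattern. The following is a simplified, self-contained sketch using hypothetical Allocation/Fence types, not the driver's PAL-based implementation:

    #include <deque>

    // Hypothetical stand-ins for the PAL-side types; the real driver uses InternalMemory
    // and Pal::IFence.
    struct Allocation { };
    struct Fence
    {
        bool signaled = false;
        bool IsSignaled() const { return signaled; }
    };

    struct Retired
    {
        Allocation* pMem;   // superseded allocation, possibly still referenced by the GPU
        Fence*      pFence; // signaled by the first submit that uses the replacement
    };

    // Free every retired allocation whose guarding fence has signaled. Entries are pushed
    // in submission order, so scanning stops at the first unsignaled fence.
    void FreeRetired(std::deque<Retired>* pRetireList, void (*pfnFree)(Allocation*))
    {
        while ((pRetireList->empty() == false) && pRetireList->front().pFence->IsSignaled())
        {
            pfnFree(pRetireList->front().pMem);
            pRetireList->pop_front();
        }
    }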
+Pal::IFence* Queue::GetCpsStackMem( + uint32_t deviceIdx, + uint64_t size) +{ + VK_ASSERT(m_pDevice->GetRuntimeSettings().cpsFlags & CpsFlagStackInGlobalMem); + + Pal::IFence* pFence = nullptr; + GpuRt::IDevice* pRtDevice = m_pDevice->RayTrace()->GpuRt(deviceIdx); + + // TODO: Cap the size to a reasonable preset + if ((m_pCpsGlobalMem == nullptr) || (m_pCpsGlobalMem->Size() < size)) + { + InternalMemory* pCpsVidMem = nullptr; + + InternalMemCreateInfo allocInfo = {}; + m_pDevice->MemMgr()->GetCommonPool(InternalPoolGpuAccess, &allocInfo); + + m_pDevice->MemMgr()->AllocGpuMem(allocInfo, pCpsVidMem, 0, VK_OBJECT_TYPE_QUEUE, 0); + VK_ASSERT(pCpsVidMem != nullptr); + + Pal::Result palResult = Pal::Result::Success; + + if (m_pCpsGlobalMem == nullptr) // first alloc + { + m_pCpsGlobalMem = pCpsVidMem; + } + else if (pCpsVidMem != nullptr) + { + Pal::IDevice* pPalDevice = m_pDevice->PalDevice(deviceIdx); + + const size_t palFenceSize = pPalDevice->GetFenceSize(&palResult); + VK_ASSERT(palResult == Pal::Result::Success); + + void* pPalMemory = m_pDevice->VkInstance()->AllocMem(palFenceSize, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + Pal::FenceCreateInfo fenceInfo = {}; + fenceInfo.flags.signaled = 0; + + if (pPalMemory != nullptr) + { + palResult = pPalDevice->CreateFence(fenceInfo, pPalMemory, &pFence); + + if (palResult == Pal::Result::Success) + { + CpsMemTracker tracker = { m_pCpsGlobalMem, pFence }; + m_cpsMemDestroyList.PushBack(tracker); + m_pCpsGlobalMem = pCpsVidMem; + } + else + { + VK_ASSERT(pFence == nullptr); + m_pDevice->VkInstance()->FreeMem(pPalMemory); + } + } + else + { + palResult = Pal::Result::ErrorOutOfMemory; + } + + if (palResult != Pal::Result::Success) + { + // Keep using the original allocation and accept the performance hit + m_pDevice->MemMgr()->FreeGpuMem(pCpsVidMem); + } + } + + // Initialize CPS memory + if (palResult == Pal::Result::Success) + { + palResult = pRtDevice->InitializeCpsMemory(*m_pCpsGlobalMem->PalMemory(deviceIdx), size); + VK_ASSERT(palResult == Pal::Result::Success); + } + } + + return pFence; +} +#endif + /** *********************************************************************************************************************** * C-Callable entry points start here. These entries go in the dispatch table(s). diff --git a/icd/api/vk_semaphore.cpp b/icd/api/vk_semaphore.cpp index daae49f5..4223459a 100644 --- a/icd/api/vk_semaphore.cpp +++ b/icd/api/vk_semaphore.cpp @@ -481,7 +481,7 @@ VkResult Semaphore::WaitSemaphoreValue( VK_ASSERT(pSemaphore->IsTimelineSemaphore()); pPalSemaphore = pSemaphore->PalSemaphore(DefaultDeviceIndex); pSemaphore->RestoreSemaphore(); - palResult = pPalSemaphore->WaitSemaphoreValue(value, timeout); + palResult = pPalSemaphore->WaitSemaphoreValue(value, Uint64ToChronoNano(timeout)); } return PalToVkResult(palResult); diff --git a/icd/api/vk_swapchain.cpp b/icd/api/vk_swapchain.cpp index 5d8f2d6e..5436a013 100644 --- a/icd/api/vk_swapchain.cpp +++ b/icd/api/vk_swapchain.cpp @@ -67,6 +67,7 @@ SwapChain::SwapChain( const Properties& properties, VkPresentModeKHR presentMode, FullscreenMgr* pFullscreenMgr, + uint32_t vidPnSourceId, Pal::WorkstationStereoMode wsStereoMode, Pal::ISwapChain* pPalSwapChain) : m_pDevice(pDevice), @@ -81,7 +82,8 @@ m_presentCount(0), m_presentMode(presentMode), m_deprecated(false) - , m_wsStereoMode(wsStereoMode) + , m_vidPnSourceId(vidPnSourceId), + m_wsStereoMode(wsStereoMode) { // Initialize the color gamut with the native values.
if (m_pFullscreenMgr != nullptr) @@ -443,8 +445,7 @@ VkResult SwapChain::Create( mode, pScreen, screenProperties.hDisplay, - swapChainCreateInfo.hWindow, - screenProperties.vidPnSourceId); + swapChainCreateInfo.hWindow); } } @@ -529,6 +530,7 @@ VkResult SwapChain::Create( properties, pCreateInfo->presentMode, pFullscreenMgr, + screenProperties.vidPnSourceId, wsStereoMode, pPalSwapChain); @@ -584,6 +586,7 @@ void SwapChain::Init(const VkAllocationCallbacks* pAllocator) { result = SetupAutoStereo(pAllocator); } + } // ===================================================================================================================== @@ -765,7 +768,7 @@ VkResult SwapChain::AcquireNextImage( if (result == VK_SUCCESS) { - acquireInfo.timeout = timeout; + acquireInfo.timeout = Uint64ToChronoNano(timeout); acquireInfo.pSemaphore = (pSemaphore != nullptr) ? pSemaphore->PalSemaphore(DefaultDeviceIndex) : nullptr; @@ -873,6 +876,14 @@ VkResult SwapChain::GetSwapchainImagesKHR( return result; } +// ===================================================================================================================== +bool SwapChain::IsFullscreenOrEfsePresent() const +{ + return ( + ((m_pFullscreenMgr != nullptr) && m_pFullscreenMgr->GetExclusiveModeFlags().acquired) + ); +} + // ===================================================================================================================== // Fills in the PAL swap chain present info with the appropriate image to present and returns its GPU memory. Pal::IGpuMemory* SwapChain::UpdatePresentInfo( @@ -896,11 +907,16 @@ Pal::IGpuMemory* SwapChain::UpdatePresentInfo( // Let the fullscreen manager perform any fullscreen ownership transitions and override some of this present // information in case it has enabled fullscreen. - if (m_pFullscreenMgr != nullptr) + if ((m_pFullscreenMgr != nullptr) + ) { - m_pFullscreenMgr->UpdatePresentInfo(this, pPresentInfo, flipFlags); + m_pFullscreenMgr->TryEnterExclusive(this); } + // Always fallback to windowed if FSE is not acquired to avoid missing presents. + pPresentInfo->presentMode = + IsFullscreenOrEfsePresent() ? Pal::PresentMode::Fullscreen : Pal::PresentMode::Windowed; + return pSrcImageGpuMemory; } @@ -1177,8 +1193,7 @@ FullscreenMgr::FullscreenMgr( FullscreenMgr::Mode mode, Pal::IScreen* pScreen, Pal::OsDisplayHandle hDisplay, - Pal::OsWindowHandle hWindow, - uint32_t vidPnSourceId) + Pal::OsWindowHandle hWindow) : m_pDevice{pDevice}, m_exclusiveModeFlags{}, @@ -1187,7 +1202,6 @@ FullscreenMgr::FullscreenMgr( m_fullscreenPresentSuccessCount{0}, m_hDisplay{hDisplay}, m_hWindow{hWindow}, - m_vidPnSourceId{vidPnSourceId}, m_mode{mode} { VK_ASSERT(m_pScreen != nullptr); @@ -1199,6 +1213,7 @@ FullscreenMgr::FullscreenMgr( bool FullscreenMgr::TryEnterExclusive( SwapChain* pSwapChain) { + // If we are not perma-disabled if (m_exclusiveModeFlags.disabled == 0) { @@ -1270,8 +1285,6 @@ bool FullscreenMgr::TryExitExclusive( if (m_pScreen != nullptr) { Pal::Result palResult = m_pScreen->ReleaseFullscreenOwnership(); - - VK_ASSERT((m_exclusiveModeFlags.acquired == 0) || (palResult == Pal::Result::Success)); } m_exclusiveModeFlags.acquired = 0; @@ -1500,7 +1513,7 @@ void FullscreenMgr::PostPresent( // DXGI fullscreen is OS controlled and may go in and out of fullscreen mode to deal with user interaction, // display toasts etc. Ignore reporting fullscreen errors on this platform. 
if ((m_exclusiveModeFlags.acquired == 0) && (m_exclusiveModeFlags.mismatchedDisplayMode == 0) && - (m_mode == Mode::Explicit) && (pSwapChain->IsDxgiEnabled() == false)) + (m_mode == Mode::Explicit)) { *pPresentResult = Pal::Result::ErrorFullscreenUnavailable; } @@ -1512,34 +1525,6 @@ void FullscreenMgr::PostPresent( } } -// ===================================================================================================================== -// This function potentially overrides normal swap chain present info by replacing a windowed present with a page- -// flipped fullscreen present. -// -// This can only happen if the screen is currently compatible with fullscreen presents and we have successfully -// acquired exclusive access to the screen. -void FullscreenMgr::UpdatePresentInfo( - SwapChain* pSwapChain, - Pal::PresentSwapChainInfo* pPresentInfo, - const Pal::FlipStatusFlags& flipFlags) -{ - // Present mode does not matter in DXGI as it is completely OS handled. This is for our internal tracking only - if (pSwapChain->IsDxgiEnabled()) - { - // If KMD reported we're in Indpendent Flip we can assume that DXGI acquired FSE. - pPresentInfo->presentMode = flipFlags.iFlip ? Pal::PresentMode::Fullscreen : Pal::PresentMode::Windowed; - } - // Try to enter (or remain in) exclusive access mode on this swap chain's screen for this present - else - { - TryEnterExclusive(pSwapChain); - - // Always fallback to windowed if FSE is not acquired to avoid missing presents. - pPresentInfo->presentMode = - m_exclusiveModeFlags.acquired ? Pal::PresentMode::Fullscreen : Pal::PresentMode::Windowed; - } -} - // ===================================================================================================================== // This function determines whether it's safe to acquire full screen exclusive or not. Pal::Result FullscreenMgr::IsFullscreenOwnershipSafe() const diff --git a/icd/api/vk_utils.cpp b/icd/api/vk_utils.cpp index a99dd888..45605b97 100644 --- a/icd/api/vk_utils.cpp +++ b/icd/api/vk_utils.cpp @@ -44,7 +44,6 @@ uint32_t GetBuildTimeHash() return Util::HashLiteralString(__DATE__ __TIME__); } -#if DEBUG // ===================================================================================================================== // If turned on and exe name is a match, this function spins idle until we have a debugger hooked. 
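The body of WaitIdleForDebugger() is not shown in this hunk; one plausible Linux implementation of such a helper (an assumption for illustration, not the driver's actual code) polls /proc/self/status until the kernel reports a tracer:

    #include <chrono>
    #include <fstream>
    #include <string>
    #include <thread>

    // Returns true once a debugger (ptrace tracer) is attached to this process.
    static bool DebuggerAttached()
    {
        std::ifstream status("/proc/self/status");
        for (std::string line; std::getline(status, line);)
        {
            if (line.rfind("TracerPid:", 0) == 0)
            {
                return std::stol(line.substr(10)) != 0;
            }
        }
        return false;
    }

    // Spin (with a small sleep) until a debugger hooks the process.
    static void SpinUntilDebuggerAttached()
    {
        while (DebuggerAttached() == false)
        {
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
        }
    }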
void WaitIdleForDebugger( @@ -81,7 +80,6 @@ void WaitIdleForDebugger( } } } -#endif } // namespace utils diff --git a/icd/res/ver.h b/icd/res/ver.h index 8f26df38..255caefd 100644 --- a/icd/res/ver.h +++ b/icd/res/ver.h @@ -36,7 +36,7 @@ #define VERSION_MAJOR_STR MAKE_VERSION_STRING(VULKAN_ICD_MAJOR_VERSION) "\0" // Bump up after each promotion to mainline -#define VULKAN_ICD_BUILD_VERSION 301 +#define VULKAN_ICD_BUILD_VERSION 304 // String version is needed with leading zeros and extra termination (unicode) #define VERSION_NUMBER_MINOR VULKAN_ICD_BUILD_VERSION @@ -45,7 +45,7 @@ // These values specify the driver ID and driver info string #define VULKAN_DRIVER_ID VK_DRIVER_ID_AMD_OPEN_SOURCE_KHR // "AMDOPEN" #define VULKAN_DRIVER_NAME_STR "AMD open-source driver" -#define VULKAN_DRIVER_INFO_STR "2024.Q1.3" +#define VULKAN_DRIVER_INFO_STR "2024.Q2.1" #define VULKAN_DRIVER_INFO_STR_LLPC "(LLPC)" // These values tell which version of the conformance test the driver is compliant against diff --git a/icd/settings/experimentsLoader.cpp b/icd/settings/experimentsLoader.cpp new file mode 100644 index 00000000..3dc66fe8 --- /dev/null +++ b/icd/settings/experimentsLoader.cpp @@ -0,0 +1,71 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#include "experimentsLoader.h" +#include "dd_settings_service.h" +#include "palPlatform.h" + +namespace vk +{ +// ===================================================================================================================== +ExperimentsLoader::ExperimentsLoader( + Pal::IPlatform* pPlatform) + : + DevDriver::SettingsBase(&m_settings, sizeof(m_settings)), + m_pPlatform(pPlatform) +{ + +} + +// ===================================================================================================================== +ExperimentsLoader::~ExperimentsLoader() +{ +} + +// ===================================================================================================================== +void ExperimentsLoader::Destroy() +{ +} + +// ===================================================================================================================== +Pal::Result ExperimentsLoader::Init() +{ + Pal::Result palResult = Pal::Result::Unsupported; + DD_RESULT result = SetupDefaultsAndPopulateMap(); + if (result == DD_RESULT_SUCCESS) + { + DevDriver::SettingsRpcService* pSettingsRpcService = m_pPlatform->GetSettingsRpcService(); + if (pSettingsRpcService) + { + pSettingsRpcService->RegisterSettingsComponent(this); + } + + palResult = Pal::Result::Success; + } + + return palResult; +} + +} diff --git a/icd/settings/experimentsLoader.h b/icd/settings/experimentsLoader.h new file mode 100644 index 00000000..7924f334 --- /dev/null +++ b/icd/settings/experimentsLoader.h @@ -0,0 +1,79 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "dd_settings_base.h" +#include "pal.h" + +#include "settings/g_experiments.h" + +// Forward declarations +namespace Pal +{ +class IPlatform; +} + +namespace vk +{ +// ===================================================================================================================== +// This class is responsible for loading the ExpSettings structure specified in the +// constructor. 
This is a helper class that only exists for a short time while the settings +// are initialized. +class ExperimentsLoader final : public DevDriver::SettingsBase +{ +public: + explicit ExperimentsLoader(Pal::IPlatform* pPlatform); + + virtual ~ExperimentsLoader(); + + Pal::Result Init(); + + void Destroy(); + + // Returns a const pointer to the settings struct + const ExpSettings* GetExpSettings() const { return &m_settings; } + // Returns a non-const pointer to the settings struct, should only be used when the settings will be modified + ExpSettings* GetMutableExpSettings() { return &m_settings; } + + // Auto-generated + uint64_t GetSettingsBlobHash() const override; + + void ReportVsyncState(ExpVSyncControl state) { m_settings.expVerticalSynchronization = state; } + +private: + + PAL_DISALLOW_COPY_AND_ASSIGN(ExperimentsLoader); + PAL_DISALLOW_DEFAULT_CTOR(ExperimentsLoader); + + // Auto-generated functions + virtual const char* GetComponentName() const override; + virtual DD_RESULT SetupDefaultsAndPopulateMap() override; + + Pal::IPlatform* m_pPlatform; + ExpSettings m_settings; +}; + +} diff --git a/icd/settings/experiments_settings_xgl.json b/icd/settings/experiments_settings_xgl.json new file mode 100644 index 00000000..79466356 --- /dev/null +++ b/icd/settings/experiments_settings_xgl.json @@ -0,0 +1,287 @@ +{ + "Version": 1, + "ComponentName": "Experiments", + "Enums": [ + { + "Name": "ExpShaderWaveSize", + "Values": [ + { + "Name": "ExpWaveSizeAuto", + "Value": 0, + "Description": "Select automatically" + }, + { + "Name": "ExpWaveSizeWave64", + "Value": 2, + "Description": "Force 64 threads per wave" + }, + { + "Name": "ExpWaveSizeWave32", + "Value": 3, + "Description": "Force 32 threads per wave" + }, + { + "Name": "ExpWaveSizeInvalid", + "Value": 4, + "Description": "Invalid Wave Size" + } + ] + } + ], + "Settings": [ + { + "Description": "Disable mesh shader support as reported by graphics API.", + "Tags": [ + "Feature" + ], + "Type": "bool", + "Name": "ExpMeshShaderSupport", + "ExperimentName": "Disable Mesh Shader Support" + }, + { + "Description": "Disable support for ray tracing as reported by graphics API.", + "Tags": [ + "Feature" + ], + "BuildTypes": [ + "VKI_RAY_TRACING" + ], + "Type": "bool", + "Name": "ExpRayTracingSupport", + "ExperimentName": "Disable Ray Tracing Support" + }, + { + "Description": "Disable support for variable rate shading as reported by graphics API.", + "Tags": [ + "Feature" + ], + "Type": "bool", + "Name": "ExpVariableRateShadingSupport", + "ExperimentName": "Disable Variable Rate Shading" + }, + { + "Description": "Disable support for native 16-bit types in shaders.", + "Tags": [ + "Feature" + ], + "Type": "bool", + "Name": "ExpNative16BitTypesSupport", + "ExperimentName": "Disable native 16-bit types support" + }, + { + "Description": "Disable support for custom AMD extensions, as offered by AMD GPU Services library in DX12 and VK_AMD_* extensions in Vulkan.", + "Tags": [ + "Feature" + ], + "Type": "bool", + "Name": "ExpAmdVendorExtensions", + "ExperimentName": "Disable AMD vendor extensions support" + }, + { + "Description": "Disable asynchronous compute queues. 
When disabled, Vulkan doesn't expose additional compute queues.", + "Tags": [ + "Feature" + ], + "Type": "bool", + "Name": "ExpComputeQueueSupport", + "ExperimentName": "Disable compute queue support" + }, + { + "Description": "Disable barrier optimizations.", + "Tags": [ + "Optimization" + ], + "Type": "bool", + "Name": "ExpBarrierOptimizations", + "ExperimentName": "Disable barrier optimizations" + }, + { + "Description": "Disable miscellaneous shader compiler optimizations.", + "Tags": [ + "Optimization" + ], + "Type": "bool", + "Name": "ExpShaderCompilerOptimizations", + "ExperimentName": "Disable shader compiler optimizations" + }, + { + "Description": "Disable optimizations applied when building ray tracing acceleration structures.", + "Tags": [ + "Optimization" + ], + "BuildTypes": [ + "VKI_RAY_TRACING" + ], + "Type": "bool", + "Name": "ExpAccelStructureOpt", + "ExperimentName": "Disable acceleration structure optimizations" + }, + { + "Description": "Force specific wave (subgroup) size in all vertex shaders where possible.", + "Tags": [ + "Optimization" + ], + "ValidValues": { + "Name": "ExpShaderWaveSize" + }, + "Type": "enum", + "Name": "ExpVsWaveSize", + "ExperimentName": "Vertex shader wave size" + }, + { + "Description": "Force specific wave (subgroup) size in all tess control shaders where possible.", + "Tags": [ + "Optimization" + ], + "ValidValues": { + "Name": "ExpShaderWaveSize" + }, + "Type": "enum", + "Name": "ExpTcsWaveSize", + "ExperimentName": "Tess control shader wave size" + }, + { + "Description": "Force specific wave (subgroup) size in all tess eval shaders where possible.", + "Tags": [ + "Optimization" + ], + "ValidValues": { + "Name": "ExpShaderWaveSize" + }, + "Type": "enum", + "Name": "ExpTesWaveSize", + "ExperimentName": "Tess eval shader wave size" + }, + { + "Description": "Force specific wave (subgroup) size in all geometry shaders where possible.", + "Tags": [ + "Optimization" + ], + "ValidValues": { + "Name": "ExpShaderWaveSize" + }, + "Type": "enum", + "Name": "ExpGsWaveSize", + "ExperimentName": "Geometry shader wave size" + }, + { + "Description": "Force specific wave (subgroup) size in all fragment shaders where possible.", + "Tags": [ + "Optimization" + ], + "ValidValues": { + "Name": "ExpShaderWaveSize" + }, + "Type": "enum", + "Name": "ExpFsWaveSize", + "ExperimentName": "Fragment shader wave size" + }, + { + "Description": "Force specific wave (subgroup) size in all compute shaders where possible.", + "Tags": [ + "Optimization" + ], + "ValidValues": { + "Name": "ExpShaderWaveSize" + }, + "Type": "enum", + "Name": "ExpCsWaveSize", + "ExperimentName": "Compute shader wave size" + }, + { + "Description": "Force specific wave (subgroup) size in all mesh shaders where possible.", + "Tags": [ + "Optimization" + ], + "ValidValues": { + "Name": "ExpShaderWaveSize" + }, + "Type": "enum", + "Name": "ExpMsWaveSize", + "ExperimentName": "Mesh shader wave size" + }, + { + "Description": "Disables Ray tracing shader inlining", + "Tags": [ + "Optimization" + ], + "BuildTypes": [ + "VKI_RAY_TRACING" + ], + "Type": "bool", + "Name": "ExpRayTracingPipelineCompilationMode", + "ExperimentName": "Disable Ray tracing shader inlining" + }, + { + "Description": "Disable shader cache.", + "Tags": [ + "Optimization" + ], + "Type": "bool", + "Name": "ExpShaderCache", + "ExperimentName": "Disable shader cache" + }, + { + "Description": "Disable texture color compression.", + "Tags": [ + "Safety" + ], + "Type": "bool", + "Name": "ExpTextureColorCompression", + 
"ExperimentName": "Disable Texture Color Compression" + }, + { + "Description": "Zero unbound descriptors.", + "Tags": [ + "Safety" + ], + "Type": "bool", + "Name": "ExpZeroUnboundDescriptors", + "ExperimentName": "Zero unbound descriptors" + }, + { + "Description": "Make command allocators thread safe.", + "Tags": [ + "Safety" + ], + "Type": "bool", + "Name": "ExpThreadSafeCommandAllocator", + "ExperimentName": "Thread-safe command allocator" + }, + { + "Description": "Enable / disable vertical synchronization.", + "Tags": [ + "Safety" + ], + "ValidValues": { + "IsEnum": true, + "Name": "ExpVSyncControl", + "Values": [ + { + "Name": "ExpVSyncControlAlwaysOff", + "Value": 0, + "Description": "Force Vsync Off." + }, + { + "Name": "ExpVSyncControlAlwaysOn", + "Value": 1, + "Description": "Force Vsync On." + }, + { + "Name": "ExpVSyncControlInvalid", + "Value": 2, + "Description": "Invalid value." + } + ] + }, + "Type": "enum", + "Name": "ExpVerticalSynchronization", + "ExperimentName": "Enable / disable vertical synchronization" + } + ], + "Tags": [ + "Feature", + "Optimization", + "Safety" + ] +} \ No newline at end of file diff --git a/icd/settings/settings.cpp b/icd/settings/settings.cpp index 441b2528..8679fb01 100644 --- a/icd/settings/settings.cpp +++ b/icd/settings/settings.cpp @@ -32,6 +32,7 @@ #include "include/vk_utils.h" #include "settings/settings.h" #include "vkgcDefs.h" +#include "settings/g_experiments.h" #include "palFile.h" #include "palHashMapImpl.h" @@ -43,6 +44,7 @@ #include "devDriverServer.h" #include "protocols/ddSettingsService.h" #include "dd_settings_service.h" +#include "experimentsLoader.h" #include "../layers/include/query_dlist.h" @@ -54,18 +56,78 @@ using namespace DevDriver::SettingsURIService; using namespace Util; +#define PAL_SET_VAL_IF_EXPERIMENT_ENABLED(opt, var, val) if (pExpSettings->exp##opt.ValueOr(false)) \ +{ \ + pPalSettings->var = val; \ +} + +#define VK_SET_VAL_IF_EXPERIMENT_ENABLED(opt, var, val) if (pExpSettings->exp##opt.ValueOr(false)) \ +{ \ + m_settings.var = val; \ +} + namespace vk { +// ===================================================================================================================== +static uint32_t ExpSwsToXglSws( + ExpShaderWaveSize wsIn) +{ + uint32_t wsOut = WaveSizeAuto; + switch (wsIn) + { + case ExpWaveSizeWave64: + wsOut = 64; + break; + case ExpWaveSizeWave32: + wsOut = 32; + break; + case ExpWaveSizeAuto: + wsOut = 0; + break; + default: + wsOut = 0; + break; + } + + return wsOut; +} + +// ===================================================================================================================== +static ExpShaderWaveSize XglSwsToExpSws( + uint32_t wsIn) +{ + ExpShaderWaveSize wsOut = ExpWaveSizeInvalid; + switch (wsIn) + { + case 64: + wsOut = ExpWaveSizeWave64; + break; + case 32: + wsOut = ExpWaveSizeWave32; + break; + case 0: + wsOut = ExpWaveSizeAuto; + break; + default: + wsOut = ExpWaveSizeInvalid; + break; + } + + return wsOut; +} + // ===================================================================================================================== // Constructor for the SettingsLoader object. 
VulkanSettingsLoader::VulkanSettingsLoader( - Pal::IDevice* pDevice, - Pal::IPlatform* pPlatform) + Pal::IDevice* pDevice, + Pal::IPlatform* pPlatform, + ExperimentsLoader* pExpLoader) : DevDriver::SettingsBase(&m_settings, sizeof(m_settings)), m_pDevice(pDevice), - m_pPlatform(pPlatform) + m_pPlatform(pPlatform), + m_pExperimentsLoader(pExpLoader) { } @@ -135,33 +197,213 @@ void VulkanSettingsLoader::OverrideSettingsBySystemInfo() } } +// ===================================================================================================================== +// Overrides the experiments info +void VulkanSettingsLoader::OverrideDefaultsExperimentInfo() +{ + const ExpSettings* pExpSettings = m_pExperimentsLoader->GetExpSettings(); + Pal::PalPublicSettings* pPalSettings = m_pDevice->GetPublicSettings(); + + VK_SET_VAL_IF_EXPERIMENT_ENABLED(MeshShaderSupport, enableMeshShaders, false); + +#if VKI_RAY_TRACING + VK_SET_VAL_IF_EXPERIMENT_ENABLED(RayTracingSupport, enableRaytracingSupport, false); +#endif + + VK_SET_VAL_IF_EXPERIMENT_ENABLED(Native16BitTypesSupport, enableNative16BitTypes, false); + + VK_SET_VAL_IF_EXPERIMENT_ENABLED(AmdVendorExtensions, disableAmdVendorExtensions, true); + + VK_SET_VAL_IF_EXPERIMENT_ENABLED(ComputeQueueSupport, asyncComputeQueueLimit, 0); + + if (pExpSettings->expBarrierOptimizations.ValueOr(false)) + { + pPalSettings->pwsMode = Pal::PwsMode::Disabled; + m_settings.useAcquireReleaseInterface = false; + } + + if (pExpSettings->expShaderCompilerOptimizations.ValueOr(false)) + { + m_settings.disableLoopUnrolls = true; + } + +#if VKI_RAY_TRACING + if (pExpSettings->expAccelStructureOpt.ValueOr(false)) + { + m_settings.rtEnableTreeRebraid = RebraidTypeOff; + m_settings.rtEnableTriangleSplitting = false; + m_settings.rtEnableTopDownBuild = false; + m_settings.rtBvhBuildModeFastBuild = BvhBuildModeLinear; + m_settings.enablePairCompressionCostCheck = true; + } +#endif + + if (pExpSettings->expVsWaveSize.HasValue()) + { + m_settings.vsWaveSize = ExpSwsToXglSws(pExpSettings->expVsWaveSize.Value()); + } + + if (pExpSettings->expTcsWaveSize.HasValue()) + { + m_settings.tcsWaveSize = ExpSwsToXglSws(pExpSettings->expTcsWaveSize.Value()); + } + + if (pExpSettings->expTesWaveSize.HasValue()) + { + m_settings.tesWaveSize = ExpSwsToXglSws(pExpSettings->expTesWaveSize.Value()); + } + + if (pExpSettings->expGsWaveSize.HasValue()) + { + m_settings.gsWaveSize = ExpSwsToXglSws(pExpSettings->expGsWaveSize.Value()); + } + + if (pExpSettings->expFsWaveSize.HasValue()) + { + m_settings.fsWaveSize = ExpSwsToXglSws(pExpSettings->expFsWaveSize.Value()); + } + + if (pExpSettings->expCsWaveSize.HasValue()) + { + m_settings.csWaveSize = ExpSwsToXglSws(pExpSettings->expCsWaveSize.Value()); + } + + if (pExpSettings->expMsWaveSize.HasValue()) + { + m_settings.meshWaveSize = ExpSwsToXglSws(pExpSettings->expMsWaveSize.Value()); + } + +#if VKI_RAY_TRACING + if (pExpSettings->expRayTracingPipelineCompilationMode.ValueOr(false)) + { + m_settings.rtCompileMode = RtCompileModeIndirect; + } +#endif + + if (pExpSettings->expShaderCache.ValueOr(false)) + { + m_settings.shaderCacheMode = ShaderCacheDisable; + m_settings.usePalPipelineCaching = false; + m_settings.allowExternalPipelineCacheObject = false; + } + + VK_SET_VAL_IF_EXPERIMENT_ENABLED(TextureColorCompression, forceEnableDcc, ForceDisableDcc); + + PAL_SET_VAL_IF_EXPERIMENT_ENABLED(ZeroUnboundDescriptors, zeroUnboundDescDebugSrd, true); + + VK_SET_VAL_IF_EXPERIMENT_ENABLED(ThreadSafeCommandAllocator, threadSafeAllocator, true); + + if
(pExpSettings->expVerticalSynchronization.HasValue()) + { + ExpVSyncControl state = pExpSettings->expVerticalSynchronization.Value(); + if (state == ExpVSyncControlAlwaysOn) + { + m_settings.vSyncControl = VSyncControlAlwaysOn; + } + else if (state == ExpVSyncControlAlwaysOff) + { + m_settings.vSyncControl = VSyncControlAlwaysOff; + } + else + { + m_pExperimentsLoader->ReportVsyncState(ExpVSyncControlInvalid); + PAL_ASSERT_ALWAYS(); + } + } +} + +// ===================================================================================================================== +// Sets the final values for the experiments +void VulkanSettingsLoader::FinalizeExperiments() +{ + ExpSettings* pExpSettings = m_pExperimentsLoader->GetMutableExpSettings(); + Pal::PalPublicSettings* pPalSettings = m_pDevice->GetPublicSettings(); + + pExpSettings->expMeshShaderSupport = (m_settings.enableMeshShaders == false); + +#if VKI_RAY_TRACING + pExpSettings->expRayTracingSupport = (m_settings.enableRaytracingSupport == false); +#endif + + pExpSettings->expVariableRateShadingSupport = (m_settings.enableVariableRateShading == false); + + pExpSettings->expNative16BitTypesSupport = (m_settings.enableNative16BitTypes == false); + + pExpSettings->expAmdVendorExtensions = m_settings.disableAmdVendorExtensions; + + pExpSettings->expComputeQueueSupport = (m_settings.asyncComputeQueueLimit == 0); + + pExpSettings->expBarrierOptimizations = ((pPalSettings->pwsMode == Pal::PwsMode::Disabled) && + (m_settings.useAcquireReleaseInterface == false)); + + pExpSettings->expVsWaveSize = XglSwsToExpSws(m_settings.vsWaveSize); + + pExpSettings->expTcsWaveSize = XglSwsToExpSws(m_settings.tcsWaveSize); + + pExpSettings->expTesWaveSize = XglSwsToExpSws(m_settings.tesWaveSize); + + pExpSettings->expGsWaveSize = XglSwsToExpSws(m_settings.gsWaveSize); + + pExpSettings->expFsWaveSize = XglSwsToExpSws(m_settings.fsWaveSize); + + pExpSettings->expCsWaveSize = XglSwsToExpSws(m_settings.csWaveSize); + + pExpSettings->expMsWaveSize = XglSwsToExpSws(m_settings.meshWaveSize); + +#if VKI_RAY_TRACING + pExpSettings->expRayTracingPipelineCompilationMode = (m_settings.rtCompileMode == RtCompileModeIndirect); +#endif + + pExpSettings->expTextureColorCompression = m_settings.forceEnableDcc == ForceDisableDcc; + + pExpSettings->expZeroUnboundDescriptors = pPalSettings->zeroUnboundDescDebugSrd; + + pExpSettings->expThreadSafeCommandAllocator = m_settings.threadSafeAllocator; +} + +// ===================================================================================================================== +// Informs tools of unsupported experiments +void VulkanSettingsLoader::ReportUnsupportedExperiments( + Pal::DeviceProperties* pInfo) +{ + if (pInfo->gfxipProperties.flags.supportDoubleRate16BitInstructions == 0) + { + m_pExperimentsLoader->SaveUnsupportedExperiment(expNative16BitTypesSupportHash); + } + + if (pInfo->gfxipProperties.srdSizes.bvh == 0) + { +#if VKI_RAY_TRACING + m_pExperimentsLoader->SaveUnsupportedExperiment(expAccelStructureOptHash); + m_pExperimentsLoader->SaveUnsupportedExperiment(expRayTracingSupportHash); +#endif + } + + if (pInfo->gfxipProperties.flags.supportMeshShader == 0) + { + m_pExperimentsLoader->SaveUnsupportedExperiment(expMeshShaderSupportHash); + } + + if (pInfo->gfxipProperties.supportedVrsRates == 0) + { + m_pExperimentsLoader->SaveUnsupportedExperiment(expVariableRateShadingSupportHash); + } +} + // ===================================================================================================================== // Override 
defaults based on application profile. This occurs before any CCC settings or private panel settings are // applied. VkResult VulkanSettingsLoader::OverrideProfiledSettings( const VkAllocationCallbacks* pAllocCb, uint32_t appVersion, - AppProfile appProfile) + AppProfile appProfile, + Pal::DeviceProperties* pInfo) { VkResult result = VkResult::VK_SUCCESS; Pal::PalPublicSettings* pPalSettings = m_pDevice->GetPublicSettings(); - Pal::DeviceProperties* pInfo = static_cast<Pal::DeviceProperties*>( - pAllocCb->pfnAllocation(pAllocCb->pUserData, - sizeof(Pal::DeviceProperties), - VK_DEFAULT_MEM_ALIGN, - VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE)); - - if (pInfo == nullptr) { - result = VkResult::VK_ERROR_OUT_OF_HOST_MEMORY; - } - - if (result == VkResult::VK_SUCCESS) - { - memset(pInfo, 0, sizeof(Pal::DeviceProperties)); - m_pDevice->GetProperties(pInfo); // By allowing the enable/disable to be set by environment variable, any third party platform owners // can enable or disable the feature based on their internal feedback and not have to wait for a driver @@ -972,7 +1214,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( if (pInfo->gfxLevel >= Pal::GfxIpLevel::GfxIp10_3) { m_settings.rtEnableCompilePipelineLibrary = false; - m_settings.rtMaxRayRecursionDepth = 2; } #if VKI_BUILD_GFX11 @@ -1112,6 +1353,16 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( m_settings.forceDepthClampBasedOnZExport = true; } +#ifndef ICD_X64_BUILD + if (appProfile == AppProfile::DXVK) + { + // DXVK Tropic4/GTA4 page fault when GPL is enabled. + // It looks like an incorrect pipeline layout is used. Forcing the indirect scheme makes the optimized + // pipeline layout compatible with the fast-linked pipeline. + m_settings.pipelineLayoutSchemeSelectionStrategy = PipelineLayoutSchemeSelectionStrategy::ForceIndirect; + } +#endif + if (appProfile == AppProfile::AshesOfTheSingularity) { // Disable image type checking on Navi10 to avoid 2.5% loss in Ashes @@ -1359,17 +1610,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( { m_settings.disableSingleMipAnisoOverride = false; } - - if (appProfile == AppProfile::Enshrouded) - { -#if VKI_BUILD_GFX11 - if (pInfo->gfxLevel >= Pal::GfxIpLevel::GfxIp11_0) - { - } -#endif - } - - pAllocCb->pfnFree(pAllocCb->pUserData, pInfo); } return result; @@ -1412,7 +1652,10 @@ VkResult VulkanSettingsLoader::ProcessSettings( uint32_t appVersion, AppProfile* pAppProfile) { - VkResult result = VkResult::VK_SUCCESS; + VkResult result = VkResult::VK_SUCCESS; + Pal::DeviceProperties info = {}; + + m_pDevice->GetProperties(&info); // The following lines to load profile settings have been copied from g_settings.cpp static_cast<Pal::IDevice*>(m_pDevice)->ReadSetting(pForceAppProfileEnableHashStr, @@ -1439,16 +1682,20 @@ VkResult VulkanSettingsLoader::ProcessSettings( #endif // Override defaults based on application profile - result = OverrideProfiledSettings(pAllocCb, appVersion, *pAppProfile); + result = OverrideProfiledSettings(pAllocCb, appVersion, *pAppProfile, &info); if (result == VkResult::VK_SUCCESS) { + ReportUnsupportedExperiments(&info); + // Read in the public settings from the Catalyst Control Center ReadPublicSettings(); // Read the rest of the settings from the registry ReadSettings(); + OverrideDefaultsExperimentInfo(); + // We need to override debug file paths settings to absolute paths as per system info OverrideSettingsBySystemInfo(); @@ -1656,6 +1903,10 @@ void VulkanSettingsLoader::UpdatePalSettings() pPalSettings->rpmViewsBypassMall = static_cast<Pal::RpmViewsBypassMall>(m_settings.rpmViewsBypassMall);
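Note: the rpmViewsBypassMall assignment above forwards an XGL-generated settings enum to PAL through a bare static_cast, which stays valid only while the two enums remain value-compatible. A small stand-in illustration (neither enum below is the real declaration):

    #include <cstdint>

    // Stand-ins only: the real types are Pal::RpmViewsBypassMall and the
    // generated XGL setting enum; they must enumerate identical values.
    namespace PalSketch { enum class RpmViewsBypassMall : uint32_t { Off = 0, OnRead = 1, OnWrite = 2 }; }
    enum class XglRpmViewsBypassMall : uint32_t { Off = 0, OnRead = 1, OnWrite = 2 };

    PalSketch::RpmViewsBypassMall ForwardSetting(XglRpmViewsBypassMall v)
    {
        // Direct enum-to-enum static_cast, as in UpdatePalSettings().
        return static_cast<PalSketch::RpmViewsBypassMall>(v);
    }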
+ // Allow Device Generated Commands to employ the state-of-the-art CP Packet path whenever possible for optimal + // performance. Only the obsolete Compute Shader path can be used otherwise. + pPalSettings->enableExecuteIndirectPacket = true; + // Controls PWS enable mode: disabled, fully enabled or partially enabled. Only takes effect if HW supports PWS and // Acq-rel barriers if (m_settings.useAcquireReleaseInterface) @@ -1710,6 +1961,9 @@ void VulkanSettingsLoader::FinalizeSettings( } GenerateSettingHash(); + + // Note this should be the last thing done when we finalize so we can capture any changes: + FinalizeExperiments(); } // ===================================================================================================================== diff --git a/icd/settings/settings.h b/icd/settings/settings.h index 1a3b74c5..b74e15ee 100644 --- a/icd/settings/settings.h +++ b/icd/settings/settings.h @@ -48,17 +48,20 @@ namespace Pal { class IDevice; class IPlatform; +struct DeviceProperties; } namespace vk { +class ExperimentsLoader; + // ===================================================================================================================== // This class is responsible for loading and processing the Vulkan runtime settings structure encapsulated in the Vulkan // Settings Loader object. class VulkanSettingsLoader : public DevDriver::SettingsBase { public: - explicit VulkanSettingsLoader(Pal::IDevice* pDevice, Pal::IPlatform* pPlatform); + explicit VulkanSettingsLoader(Pal::IDevice* pDevice, Pal::IPlatform* pPlatform, ExperimentsLoader* pExpLoader); virtual ~VulkanSettingsLoader(); Pal::Result Init(); @@ -102,10 +105,17 @@ class VulkanSettingsLoader : public DevDriver::SettingsBase VkResult OverrideProfiledSettings( const VkAllocationCallbacks* pAllocCb, uint32_t appVersion, - AppProfile appProfile); + AppProfile appProfile, + Pal::DeviceProperties* pInfo); + + void ReportUnsupportedExperiments(Pal::DeviceProperties* pInfo); void OverrideSettingsBySystemInfo(); + void OverrideDefaultsExperimentInfo(); + + void FinalizeExperiments(); + void DumpAppProfileChanges( AppProfile appProfile); @@ -113,6 +123,7 @@ class VulkanSettingsLoader : public DevDriver::SettingsBase Pal::IDevice* m_pDevice; Pal::IPlatform* m_pPlatform; + ExperimentsLoader* m_pExperimentsLoader; RuntimeSettings m_settings; Util::MetroHash::Hash m_settingsHash; }; diff --git a/icd/settings/settings_xgl.json b/icd/settings/settings_xgl.json index b7fcccd9..0f88fd4f 100644 --- a/icd/settings/settings_xgl.json +++ b/icd/settings/settings_xgl.json @@ -2906,7 +2906,7 @@ "VKI_RAY_TRACING" ], "Defaults": { - "Default": 1 + "Default": 31 }, "Type": "uint32", "Scope": "Driver" @@ -3203,12 +3203,42 @@ "VKI_RAY_TRACING" ], "Defaults": { - "Default": 1.3 + "Default": 1.15 }, "Type": "float", "Name": "RtTriangleSplittingFactor", "Scope": "Driver" }, + { + "Description": "If RtEnableTriangleSplitting is enabled, this setting will limit the maximum number of splits per triangle.
A value=0 disables the setting.", + "Tags": [ + "Ray Tracing" + ], + "BuildTypes": [ + "VKI_RAY_TRACING" + ], + "Defaults": { + "Default": 0 + }, + "Type": "uint32", + "Name": "RtTriangleSplittingBudgetPerTriangle", + "Scope": "Driver" + }, + { + "Description": "If RtEnableTriangleSplitting is enabled, this factor will affect the priority in triangle splitting.", + "Tags": [ + "Ray Tracing" + ], + "BuildTypes": [ + "VKI_RAY_TRACING" + ], + "Defaults": { + "Default": 1.0 + }, + "Type": "float", + "Name": "RtTriangleSplittingPriority", + "Scope": "Driver" + }, { "Name": "RtEnableMortonCode30", "Description": "Enable Morton Code 30 bits", @@ -3923,6 +3953,21 @@ "Name": "RtEnableBuildParallel", "Scope": "Driver" }, + { + "Description": "When the LBVH builder is selected, enable the Fast LBVH path.", + "Tags": [ + "Ray Tracing" + ], + "BuildTypes": [ + "VKI_RAY_TRACING" + ], + "Defaults": { + "Default": false + }, + "Type": "bool", + "Name": "RtEnableFastLBVH", + "Scope": "Driver" + }, { "Description": "Waves per SIMD to launch for parallel build. 0 chooses the default.", "Tags": [ @@ -3980,21 +4025,6 @@ "Default": 32 } }, - { - "Name": "RtEnableAccelerationStructureScratchMemoryDump", - "Description": "Dumps scratch memory from acceleration structures. Written to the directory specified by BaseLogDirPath.", - "Tags": [ - "Ray Tracing" - ], - "BuildTypes": [ - "VKI_RAY_TRACING" - ], - "Defaults": { - "Default": false - }, - "Type": "bool", - "Scope": "Driver" - }, { "Name": "RtEnableBuildAccelStructStats", "Description": "Dump built acceleration stats. (Pending implementation)", @@ -5318,21 +5348,6 @@ "Name": "RtIndirectVgprLimit", "Scope": "Driver" }, - { - "Description": "Acceleration structures used in TraceRay are tracked for dumping purposes. Disables tracking during Build calls.", - "Tags": [ - "Ray Tracing" - ], - "BuildTypes": [ - "VKI_RAY_TRACING" - ], - "Defaults": { - "Default": false - }, - "Type": "bool", - "Name": "EnableTraceRayAccelStructTracking", - "Scope": "Driver" - }, { "Description": "Force rebuild for acceleration structure updates.", "Tags": [ @@ -8743,9 +8758,6 @@ "Tags": [ "Debugging" ], - "BuildTypes": [ - "DEBUG" - ], "Defaults": { "Default": false }, @@ -8758,9 +8770,6 @@ "Tags": [ "Debugging" ], - "BuildTypes": [ - "DEBUG" - ], "Defaults": { "Default": "" }, @@ -8773,9 +8782,6 @@ "Tags": [ "Debugging" ], - "BuildTypes": [ - "DEBUG" - ], "Defaults": { "Default": 0 }, @@ -8945,7 +8951,7 @@ "Name": "ReportSuboptimalPresentAsOutOfDate" }, { - "Name": "ExportNVComputeShaderDerivatives", + "Name": "ExportNvComputeShaderDerivatives", "Description": "Export extension NV_compute_shader_derivatives", "Tags": [ "General" @@ -8956,6 +8962,18 @@ "Type": "bool", "Scope": "Driver" }, + { + "Name": "ExportNvDeviceGeneratedCommands", + "Description": "Export extension NV_device_generated_commands and NV_device_generated_commands_compute", + "Tags": [ + "General" + ], + "Defaults": { + "Default": false + }, + "Type": "bool", + "Scope": "Driver" + }, { "Name": "ExportImageCompressionControl", "Description": "Export extension VK_EXT_image_compression_control",