From 1c16288f5bb5fe28fe667cc447e385cfbc4db901 Mon Sep 17 00:00:00 2001
From: Stephen Jia
Date: Wed, 6 Mar 2024 16:44:52 -0800
Subject: [PATCH] Use lazy descriptor pool allocation (#2285)

Summary:

## Context

In Vulkan, memory for Descriptor Sets (which are used to bind data to shader arguments) must be pre-allocated. Previously, the convention was to allocate a large number of descriptor sets upon creation of a Vulkan Context. While this worked well in Lite Interpreter, where only a single global Vulkan context is used, it leads to over-allocating descriptor sets in the Vulkan Delegate, where every `ComputeGraph` has its own dedicated Context.

https://github.com/pytorch/pytorch/pull/121134 allows the Descriptor Set pool to be initialized in a deferred fashion. This means that a `ComputeGraph` can count the total number of descriptors needed across all the compute shaders that will be encoded, and then allocate a Descriptor Set Pool of the appropriate size.

## Implementation Overview

1. When constructing a `ComputeGraph`, make sure that the descriptor pool config contains 0 for the maximum number of sets. This ensures that no descriptor pool is initialized when constructing the graph's `api::Context` instance.
2. When building the graph, `ExecuteNode` and `PrepackNode` call `graph.update_descriptor_counts(shader)` upon construction, which allows the `ComputeGraph` to count the total number of descriptor sets needed.
3. There are separate descriptor count objects for prepack and execute, since they correspond to different command buffers.
4. Before encoding any command buffers, call `graph.prepare()`, which constructs a descriptor pool config from the descriptor counts.

A sketch of the resulting call sequence is included at the end of this summary.

## Notes

One interesting finding is that I had to apply a safety factor to the descriptor counts to prevent the pool from running out of memory. This was reproducible on both Linux and Android. A more robust design, e.g. as discussed [here](https://www.reddit.com/r/vulkan/comments/17v66fi/question_about_descriptor_pool_allocations/), may be to maintain separate descriptor pools for each layout type. We should revisit this refactor at a later time.
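Below is a minimal sketch of the new lifecycle from the delegate's and the unit tests' point of view. It is illustrative only: the include path and the omitted op-building calls are assumptions, and `lazy_pool_lifecycle()` is a hypothetical driver function rather than code from this diff.

```cpp
// Sketch of the lazy descriptor pool lifecycle (assumed include path).
#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>

using namespace at::native::vulkan;

void lazy_pool_lifecycle() {
  // GraphConfig() now requests 0 for descriptorPoolMaxSets, so constructing
  // the graph's api::Context does not allocate a descriptor pool.
  GraphConfig config;
  ComputeGraph graph(config);

  // ... build the graph here (add tensors, op nodes, inputs/outputs). Each
  // ExecuteNode / PrepackNode constructor calls
  // graph.update_descriptor_counts(shader, /*execute = */ ...) so the graph
  // tallies the descriptors each command buffer will need.

  // Merge the prepack and execute tallies, apply descriptorPoolSafetyFactor,
  // and initialize the descriptor pool with the resulting sizes.
  graph.prepare();

  // Only after prepare() are command buffers encoded.
  graph.encode_prepack();
  graph.prepack();
  graph.encode_execute();
}
```

The key design point is that `prepare()` must run after graph building and before any `encode_*()` call; the delegate does this in `compileModel()` and the updated tests do the same before `encode_execute()`.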
Reviewed By: jorgep31415 Differential Revision: D54603935 --- backends/vulkan/runtime/VulkanBackend.cpp | 37 +---------- .../vulkan/runtime/graph/ComputeGraph.cpp | 65 +++++++++++++++++++ backends/vulkan/runtime/graph/ComputeGraph.h | 13 ++++ backends/vulkan/runtime/graph/GraphConfig.cpp | 56 ++++++++++++++++ backends/vulkan/runtime/graph/GraphConfig.h | 10 +++ .../vulkan/runtime/graph/ops/ExecuteNode.cpp | 15 +++++ .../vulkan/runtime/graph/ops/ExecuteNode.h | 8 +-- .../vulkan/runtime/graph/ops/PrepackNode.cpp | 17 +++++ .../vulkan/runtime/graph/ops/PrepackNode.h | 9 +-- .../runtime/graph/ops/impl/Arithmetic.cpp | 1 + .../vulkan/runtime/graph/ops/impl/Staging.cpp | 4 +- .../vulkan/test/vulkan_compute_api_test.cpp | 51 ++++----------- 12 files changed, 198 insertions(+), 88 deletions(-) create mode 100644 backends/vulkan/runtime/graph/GraphConfig.cpp diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index 1222ee38e5..b5d3441886 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -62,39 +62,6 @@ api::ScalarType get_scalar_type(const vkgraph::VkDataType& vk_datatype) { } } -GraphConfig generate_config() { - const uint32_t submit_frequency = UINT32_MAX; - - const api::CommandPoolConfig cmd_config{ - 4u, // cmdPoolInitialSize - 2u, // cmdPoolBatchSize - }; - - const api::DescriptorPoolConfig descriptor_pool_config{ - 1024u, // descriptorPoolMaxSets - 1024u, // descriptorUniformBufferCount - 1024u, // descriptorStorageBufferCount - 1024u, // descriptorCombinedSamplerCount - 1024u, // descriptorStorageImageCount - 32u, // descriptorPileSizes - }; - - const api::QueryPoolConfig query_pool_config{}; - - const api::ContextConfig context_config{ - submit_frequency, // cmdSubmitFrequency - cmd_config, // cmdPoolConfig - descriptor_pool_config, // descriptorPoolConfig - query_pool_config, // queryPoolConfig - }; - - const GraphConfig graph_config{ - context_config, - }; - - return graph_config; -} - class GraphBuilder { ComputeGraph* compute_graph_; VkGraphPtr flatbuffer_; @@ -269,6 +236,8 @@ class VulkanBackend final : public PyTorchBackendInterface { builder.build_graph(); + compute_graph->prepare(); + compute_graph->encode_prepack(); compute_graph->prepack(); @@ -284,7 +253,7 @@ class VulkanBackend final : public PyTorchBackendInterface { ComputeGraph* compute_graph = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR( context.get_runtime_allocator(), ComputeGraph); - new (compute_graph) ComputeGraph(generate_config()); + new (compute_graph) ComputeGraph(GraphConfig()); Error err = compileModel(processed->data(), compute_graph); diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 0900dfb9c1..6aa9171d9f 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -18,6 +18,8 @@ namespace vulkan { ComputeGraph::ComputeGraph(GraphConfig config) : config_{config}, + prepack_descriptor_counts_{}, + execute_descriptor_counts_{}, context_{new api::Context( api::runtime()->default_adapter_i(), config_.contextConfig)}, @@ -27,6 +29,19 @@ ComputeGraph::ComputeGraph(GraphConfig config) execute_nodes_{}, inputs_{}, outputs_{} { + // Ensure that descriptor counts are initialized to 0 + prepack_descriptor_counts_.descriptorPoolMaxSets = 0; + prepack_descriptor_counts_.descriptorUniformBufferCount = 0; + prepack_descriptor_counts_.descriptorStorageBufferCount = 0; + 
prepack_descriptor_counts_.descriptorCombinedSamplerCount = 0; + prepack_descriptor_counts_.descriptorStorageImageCount = 0; + + execute_descriptor_counts_.descriptorPoolMaxSets = 0; + execute_descriptor_counts_.descriptorUniformBufferCount = 0; + execute_descriptor_counts_.descriptorStorageBufferCount = 0; + execute_descriptor_counts_.descriptorCombinedSamplerCount = 0; + execute_descriptor_counts_.descriptorStorageImageCount = 0; + context_->set_cmd(/*reusable = */ true); } @@ -39,6 +54,33 @@ ComputeGraph::~ComputeGraph() { context_->flush(); } +void ComputeGraph::update_descriptor_counts( + const api::ShaderInfo& shader_info, + bool execute) { + api::DescriptorPoolConfig* config = + execute ? &execute_descriptor_counts_ : &prepack_descriptor_counts_; + + config->descriptorPoolMaxSets += 1; + for (const VkDescriptorType arg_type : shader_info.kernel_layout) { + switch (arg_type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + config->descriptorUniformBufferCount += 1; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + config->descriptorStorageBufferCount += 1; + break; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + config->descriptorCombinedSamplerCount += 1; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + config->descriptorStorageImageCount += 1; + break; + default: + VK_THROW("Unsupported descriptor type!"); + } + } +} + ValueRef ComputeGraph::add_tensor( const std::vector& sizes, const api::ScalarType dtype, @@ -138,6 +180,29 @@ void ComputeGraph::copy_from_staging( copy_staging_to_ptr(staging, data, nbytes); } +void ComputeGraph::prepare() { +#define MERGE_FIELD(field) \ + static_cast(std::ceil( \ + std::max( \ + execute_descriptor_counts_.field, \ + prepack_descriptor_counts_.field) * \ + config_.descriptorPoolSafetyFactor)) + + api::DescriptorPoolConfig config{ + MERGE_FIELD(descriptorPoolMaxSets), + MERGE_FIELD(descriptorUniformBufferCount), + MERGE_FIELD(descriptorStorageBufferCount), + MERGE_FIELD(descriptorCombinedSamplerCount), + MERGE_FIELD(descriptorStorageImageCount), + 1u, + }; + + if (!context_->descriptor_pool()) { + context_->descriptor_pool().init(config); + } +#undef MERGE_FIELD +} + void ComputeGraph::encode_prepack() { for (std::unique_ptr& node : prepack_nodes_) { node->encode(this); diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index a45e449ae2..7917304f0c 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -60,6 +60,9 @@ class ComputeGraph final { private: GraphConfig config_; + api::DescriptorPoolConfig prepack_descriptor_counts_; + api::DescriptorPoolConfig execute_descriptor_counts_; + std::unique_ptr context_; std::vector shared_objects_; std::vector values_; @@ -87,6 +90,10 @@ class ComputeGraph final { return outputs_; } + void update_descriptor_counts( + const api::ShaderInfo& shader_info, + bool execute); + /* * Returns the value at a particular reference */ @@ -163,6 +170,12 @@ class ComputeGraph final { SharedObject& get_shared_object(const int64_t idx); + // + // Graph Preparation + // + + void prepare(); + // // Input/Output // diff --git a/backends/vulkan/runtime/graph/GraphConfig.cpp b/backends/vulkan/runtime/graph/GraphConfig.cpp new file mode 100644 index 0000000000..8cda518dae --- /dev/null +++ b/backends/vulkan/runtime/graph/GraphConfig.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace at { +namespace native { +namespace vulkan { + +GraphConfig::GraphConfig() { + // No automatic submissions + const uint32_t submit_frequency = UINT32_MAX; + + // Only one command buffer will be encoded at a time + const api::CommandPoolConfig cmd_config{ + 1u, // cmdPoolInitialSize + 1u, // cmdPoolBatchSize + }; + + // Use lazy descriptor pool initialization by default; the graph runtime will + // tally up the number of descriptor sets needed while building the graph and + // trigger descriptor pool initialization with exact sizes before encoding the + // command buffer. + const api::DescriptorPoolConfig descriptor_pool_config{ + 0u, // descriptorPoolMaxSets + 0u, // descriptorUniformBufferCount + 0u, // descriptorStorageBufferCount + 0u, // descriptorCombinedSamplerCount + 0u, // descriptorStorageImageCount + 0u, // descriptorPileSizes + }; + + const api::QueryPoolConfig query_pool_config{}; + + const api::ContextConfig context_config{ + submit_frequency, // cmdSubmitFrequency + cmd_config, // cmdPoolConfig + descriptor_pool_config, // descriptorPoolConfig + query_pool_config, // queryPoolConfig + }; + + contextConfig = context_config; + + // Empirically selected safety factor. If descriptor pools start running out + // of memory, increase this safety factor. + descriptorPoolSafetyFactor = 1.25; +} + +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h index 0cb9bb6f53..e2c8d6bed0 100644 --- a/backends/vulkan/runtime/graph/GraphConfig.h +++ b/backends/vulkan/runtime/graph/GraphConfig.h @@ -18,6 +18,16 @@ namespace vulkan { struct GraphConfig final { api::ContextConfig contextConfig; + + // Creating a descriptor pool with exactly the number of descriptors tallied + // by iterating through the shader layouts of shaders used in the graph risks + // the descriptor pool running out of memory, therefore apply a safety factor + // to descriptor counts when creating the descriptor pool to mitigate this + // risk. 
+ float descriptorPoolSafetyFactor; + + // Generate a default graph config with pre-configured settings + explicit GraphConfig(); }; } // namespace vulkan diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp index 7c1f0fe807..c9c338bc17 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp @@ -16,6 +16,21 @@ namespace at { namespace native { namespace vulkan { +ExecuteNode::ExecuteNode( + ComputeGraph& graph, + const api::ShaderInfo& shader, + const api::utils::uvec3& global_workgroup_size, + const api::utils::uvec3& local_workgroup_size, + const std::vector& args, + api::UniformParamsBuffer&& params) + : shader_(shader), + global_workgroup_size_(global_workgroup_size), + local_workgroup_size_(local_workgroup_size), + args_(args), + params_(std::move(params)) { + graph.update_descriptor_counts(shader, /*execute = */ true); +} + void ExecuteNode::encode(ComputeGraph* graph) { api::Context* const context = graph->context(); api::PipelineBarrier pipeline_barrier{}; diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h index ddd50c1f67..f3c2bba9c0 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h @@ -50,16 +50,12 @@ class ExecuteNode final { public: ExecuteNode( + ComputeGraph& graph, const api::ShaderInfo& shader, const api::utils::uvec3& global_workgroup_size, const api::utils::uvec3& local_workgroup_size, const std::vector& args, - api::UniformParamsBuffer&& params) - : shader_(shader), - global_workgroup_size_(global_workgroup_size), - local_workgroup_size_(local_workgroup_size), - args_(args), - params_(std::move(params)) {} + api::UniformParamsBuffer&& params); ~ExecuteNode() = default; diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index d16c671ba4..69e6ffabd6 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -17,6 +17,23 @@ namespace at { namespace native { namespace vulkan { +PrepackNode::PrepackNode( + ComputeGraph& graph, + const api::ShaderInfo& shader, + const api::utils::uvec3& global_workgroup_size, + const api::utils::uvec3& local_workgroup_size, + const ValueRef tref, + const ValueRef packed, + api::UniformParamsBuffer&& params) + : shader_(shader), + global_workgroup_size_(global_workgroup_size), + local_workgroup_size_(local_workgroup_size), + tref_(tref), + packed_(packed), + params_(std::move(params)) { + graph.update_descriptor_counts(shader, /*execute = */ false); +} + void PrepackNode::encode(ComputeGraph* graph) { api::Context* const context = graph->context(); api::PipelineBarrier pipeline_barrier{}; diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h index b3a5fd0086..59071e9371 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.h +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.h @@ -33,18 +33,13 @@ class PrepackNode final { public: PrepackNode( + ComputeGraph& graph, const api::ShaderInfo& shader, const api::utils::uvec3& global_workgroup_size, const api::utils::uvec3& local_workgroup_size, const ValueRef tref, const ValueRef packed, - api::UniformParamsBuffer&& params) - : shader_(shader), - global_workgroup_size_(global_workgroup_size), - local_workgroup_size_(local_workgroup_size), - tref_(tref), - 
packed_(packed), - params_(std::move(params)) {} + api::UniformParamsBuffer&& params); ~PrepackNode() = default; diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp index f5895c1544..108ff2b2dc 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp @@ -72,6 +72,7 @@ void add_arithmetic_node( api::UniformParamsBuffer params(graph.context(), block); graph.execute_nodes().emplace_back(new ExecuteNode( + graph, shader, global_size, local_size, diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index 41104532d4..953a06426a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -48,6 +48,7 @@ void add_staging_to_tensor_node( graph.context(), create_staging_params(t_out)); graph.execute_nodes().emplace_back(new ExecuteNode( + graph, shader, global_size, local_size, @@ -90,6 +91,7 @@ void add_tensor_to_staging_node( } graph.execute_nodes().emplace_back(new ExecuteNode( + graph, shader, global_size, local_size, @@ -112,7 +114,7 @@ ValueRef prepack(ComputeGraph& graph, const ValueRef vref) { api::UniformParamsBuffer params(graph.context(), sp); graph.prepack_nodes().emplace_back(new PrepackNode( - shader, global_size, local_size, vref, v, std::move(params))); + graph, shader, global_size, local_size, vref, v, std::move(params))); return v; } diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index c53444ff0b..5c1fc8f3c5 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -108,39 +108,6 @@ size_t get_vma_allocation_count() { return get_vma_stats().total.statistics.allocationCount; } -GraphConfig generate_graph_config() { - const uint32_t submit_frequency = UINT32_MAX; - - const api::CommandPoolConfig cmd_config{ - 4u, // cmdPoolInitialSize - 2u, // cmdPoolBatchSize - }; - - const api::DescriptorPoolConfig descriptor_pool_config{ - 1024u, // descriptorPoolMaxSets - 1024u, // descriptorUniformBufferCount - 1024u, // descriptorStorageBufferCount - 1024u, // descriptorCombinedSamplerCount - 1024u, // descriptorStorageImageCount - 32u, // descriptorPileSizes - }; - - const api::QueryPoolConfig query_pool_config{}; - - const api::ContextConfig context_config{ - submit_frequency, // cmdSubmitFrequency - cmd_config, // cmdPoolConfig - descriptor_pool_config, // descriptorPoolConfig - query_pool_config, // queryPoolConfig - }; - - const GraphConfig graph_config{ - context_config, - }; - - return graph_config; -} - // // Test Wrapper // @@ -428,7 +395,7 @@ TEST_F(VulkanComputeAPITest, use_non_bound_textures_fails) { graph.copy_from_staging(name.staging, data_##name.data(), data_##name.size()); TEST(VulkanComputeGraphTest, test_values_scalars) { - GraphConfig config = generate_graph_config(); + GraphConfig config; ComputeGraph graph(config); ValueRef idx; @@ -441,7 +408,7 @@ TEST(VulkanComputeGraphTest, test_values_scalars) { } TEST(VulkanComputeGraphTest, test_values_scalar_list_inplace_constructed) { - GraphConfig config = generate_graph_config(); + GraphConfig config; ComputeGraph graph(config); ValueRef idx = graph.add_scalar_list({1, 2, 3, 4}); @@ -453,7 +420,7 @@ TEST(VulkanComputeGraphTest, test_values_scalar_list_inplace_constructed) { } TEST(VulkanComputeGraphTest, 
test_values_scalar_list_outside_constructed) { - GraphConfig config = generate_graph_config(); + GraphConfig config; ComputeGraph graph(config); ValueRef idx; @@ -469,7 +436,7 @@ TEST(VulkanComputeGraphTest, test_values_scalar_list_outside_constructed) { } TEST(VulkanComputeGraphTest, test_values_string) { - GraphConfig config = generate_graph_config(); + GraphConfig config; ComputeGraph graph(config); ValueRef idx; @@ -482,7 +449,7 @@ TEST(VulkanComputeGraphTest, test_values_string) { } TEST(VulkanComputeGraphTest, test_simple_graph) { - GraphConfig config = generate_graph_config(); + GraphConfig config; ComputeGraph graph(config); std::vector size_big = {4, 4, 4}; @@ -502,6 +469,7 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { out.staging = graph.set_output_tensor(out.value); + graph.prepare(); graph.encode_execute(); // Run graph @@ -531,7 +499,7 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { ValueRef name = graph.add_tensorref(sizes, api::kFloat, data_##name.data()); TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { - GraphConfig config = generate_graph_config(); + GraphConfig config; ComputeGraph graph(config); std::vector size_big = {4, 4, 4}; @@ -554,6 +522,8 @@ TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { out.value = e; out.staging = graph.set_output_tensor(out.value); + graph.prepare(); + graph.encode_prepack(); graph.prepack(); @@ -579,7 +549,7 @@ TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { } TEST(VulkanComputeGraphTest, test_simple_shared_objects) { - GraphConfig config = generate_graph_config(); + GraphConfig config; ComputeGraph graph(config); std::vector size_big = {4, 4, 4}; @@ -637,6 +607,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects) { // 1 staging buffer for the input tensor EXPECT_TRUE(get_vma_allocation_count() == 10); + graph.prepare(); graph.encode_execute(); // Allocation count will be 13, three shared objects are allocated for total: