diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp new file mode 100644 index 0000000000..6bdb07e719 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +namespace at { +namespace native { +namespace vulkan { + +void ExecuteNode::encode(ComputeGraph* graph) { + api::Context* const context = graph->context(); + api::PipelineBarrier pipeline_barrier{}; + + std::unique_lock cmd_lock = context->dispatch_lock(); + + api::DescriptorSet descriptor_set = + context->get_descriptor_set(shader_, local_workgroup_size_); + + uint32_t idx = 0; + idx = bind_values_to_descriptor_set( + graph, + outputs_, + pipeline_barrier, + api::MemoryAccessType::WRITE, + descriptor_set, + idx); + idx = bind_values_to_descriptor_set( + graph, + inputs_, + pipeline_barrier, + api::MemoryAccessType::READ, + descriptor_set, + idx); + descriptor_set.bind(idx, params_.buffer()); + + context->register_shader_dispatch( + descriptor_set, pipeline_barrier, shader_, global_workgroup_size_); +} + +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h index 2b8fb04cbd..1b726e73d4 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h @@ -33,20 +33,37 @@ class ExecuteNode { public: ExecuteNode(ValueRef input, ValueRef output) - : inputs_{input}, outputs_{output} {} + : outputs_{output}, inputs_{input} {} + ExecuteNode( + const api::ShaderInfo& shader, + const api::utils::uvec3& global_workgroup_size, + const api::utils::uvec3& local_workgroup_size, + const std::vector& outputs, const std::vector& inputs, - const std::vector& outputs) - : inputs_(inputs), outputs_(outputs) {} + api::UniformParamsBuffer&& params) + : shader_(shader), + global_workgroup_size_(global_workgroup_size), + local_workgroup_size_(local_workgroup_size), + outputs_(outputs), + inputs_(inputs), + params_(std::move(params)) {} virtual ~ExecuteNode() = default; protected: - std::vector inputs_; + // TODO: Consider making members const after we remove StagingNode. + api::ShaderInfo shader_; + api::utils::uvec3 global_workgroup_size_; + api::utils::uvec3 local_workgroup_size_; std::vector outputs_; + std::vector inputs_; + // TODO(T180906086): pass multiple buffers and index with ValueRef. + // TODO(T180906457): allow re-computing param buffers. + api::UniformParamsBuffer params_; public: - virtual void encode(ComputeGraph* graph) const = 0; + virtual void encode(ComputeGraph* graph); }; } // namespace vulkan diff --git a/backends/vulkan/runtime/graph/ops/Utils.cpp b/backends/vulkan/runtime/graph/ops/Utils.cpp new file mode 100644 index 0000000000..579eac54e3 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/Utils.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace at { +namespace native { +namespace vulkan { + +api::utils::ivec4 get_size_as_ivec4(const vTensor& t) { + return api::utils::make_ivec4( + {dim_at(t), + dim_at(t), + dim_at(t), + dim_at(t)}); +} + +void bind_tensor_to_descriptor_set( + vTensor& tensor, + api::PipelineBarrier& pipeline_barrier, + const api::MemoryAccessType accessType, + api::DescriptorSet& descriptor_set, + const uint32_t idx) { + if (tensor.buffer()) { + api::VulkanBuffer& buffer = tensor.buffer( + pipeline_barrier, api::PipelineStage::COMPUTE, accessType); + descriptor_set.bind(idx, buffer); + } else { + api::VulkanImage& image = + tensor.image(pipeline_barrier, api::PipelineStage::COMPUTE, accessType); + descriptor_set.bind(idx, image); + } +} + +uint32_t bind_values_to_descriptor_set( + ComputeGraph* graph, + const std::vector& args, + api::PipelineBarrier& pipeline_barrier, + const api::MemoryAccessType accessType, + api::DescriptorSet& descriptor_set, + const uint32_t base_idx) { + uint32_t idx = base_idx; + for (auto& arg : args) { + Value& val = graph->get_val(arg); + if (val.isTensor()) { + vTensor& tensor = val.toTensor(); + bind_tensor_to_descriptor_set( + tensor, pipeline_barrier, accessType, descriptor_set, idx++); + } else { + VK_THROW("Unsupported type: ", val.type()); + } + } + return idx; +} + +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/backends/vulkan/runtime/graph/ops/Utils.h b/backends/vulkan/runtime/graph/ops/Utils.h index f962c17bcc..9cf214ca87 100644 --- a/backends/vulkan/runtime/graph/ops/Utils.h +++ b/backends/vulkan/runtime/graph/ops/Utils.h @@ -10,7 +10,7 @@ #ifdef USE_VULKAN_API -#include +#include #include @@ -21,6 +21,23 @@ namespace vulkan { #define DECLARE_OP_FN(function) \ ValueRef function(ComputeGraph& graph, const std::vector& args); +api::utils::ivec4 get_size_as_ivec4(const vTensor& t); + +void bind_tensor_to_descriptor_set( + vTensor& tensor, + api::PipelineBarrier& pipeline_barrier, + const api::MemoryAccessType accessType, + api::DescriptorSet& descriptor_set, + const uint32_t idx); + +uint32_t bind_values_to_descriptor_set( + ComputeGraph* graph, + const std::vector& args, + api::PipelineBarrier& pipeline_barrier, + const api::MemoryAccessType accessType, + api::DescriptorSet& descriptor_set, + const uint32_t base_idx); + } // namespace vulkan } // namespace native } // namespace at diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp index 8d316e9f48..ce43005384 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp @@ -8,44 +8,39 @@ #include -#include - #include namespace at { namespace native { namespace vulkan { -#define DEFINE_ARITHMETIC_FN(function, op_type) \ +#define DEFINE_ARITHMETIC_FN(function, shader) \ ValueRef function(ComputeGraph& graph, const std::vector& args) { \ return add_arithmetic_node( \ - graph, \ - args[0], \ - args[1], \ - args[2], \ - arithmetic::OpType::op_type, \ - args[3]); \ + graph, args[0], args[1], args[2], VK_KERNEL(shader), args[3]); \ } -DEFINE_ARITHMETIC_FN(add, ADD); -DEFINE_ARITHMETIC_FN(sub, SUB); -DEFINE_ARITHMETIC_FN(mul, MUL); -DEFINE_ARITHMETIC_FN(div, DIV); -DEFINE_ARITHMETIC_FN(floor_div, FLOOR_DIV); -DEFINE_ARITHMETIC_FN(pow, POW); +DEFINE_ARITHMETIC_FN(add, add); +DEFINE_ARITHMETIC_FN(sub, sub); +DEFINE_ARITHMETIC_FN(mul, mul); +DEFINE_ARITHMETIC_FN(div, div); +DEFINE_ARITHMETIC_FN(floor_div, floor_divide); +DEFINE_ARITHMETIC_FN(pow, pow); +// TODO(T180908843): Bypass this entrypoint function by creating `ValueRef out` +// ahead of time. ValueRef add_arithmetic_node( ComputeGraph& graph, const ValueRef in1, const ValueRef in2, const float alpha, - const arithmetic::OpType optype, + const api::ShaderInfo& shader, const int64_t shared_object_idx) { std::vector in1_sizes = graph.get_val_sizes(in1); api::ScalarType in1_dtype = graph.get_val_dtype(in1); ValueRef out = graph.add_tensor(in1_sizes, in1_dtype, shared_object_idx); - add_arithmetic_node(graph, in1, in2, out, alpha, optype); + add_arithmetic_node(graph, in1, in2, out, alpha, shader); return out; } @@ -67,12 +62,27 @@ void add_arithmetic_node( const ValueRef in2, const ValueRef out, const float alpha, - const arithmetic::OpType optype) { + const api::ShaderInfo& shader) { ValueRef arg1 = prepack_if_tensor_ref(graph, in1); ValueRef arg2 = prepack_if_tensor_ref(graph, in2); - graph.execute_nodes().emplace_back( - new ArithmeticNode(arg1, arg2, out, alpha, optype)); + vTensor& t_in1 = graph.get_val(arg1).toTensor(); + vTensor& t_in2 = graph.get_val(arg2).toTensor(); + vTensor& t_out = graph.get_val(out).toTensor(); + + api::utils::uvec3 global_size = t_out.extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + ArithmeticParams block{ + get_size_as_ivec4(t_out), + get_size_as_ivec4(t_in1), + get_size_as_ivec4(t_in2), + 1.0, + }; + api::UniformParamsBuffer params(graph.context(), block); + + graph.execute_nodes().emplace_back(new ExecuteNode( + shader, global_size, local_size, {out}, {arg1, arg2}, std::move(params))); } ArithmeticPrepack::ArithmeticPrepack(const ValueRef tref, const ValueRef packed) @@ -92,23 +102,6 @@ void ArithmeticPrepack::encode(ComputeGraph* graph) const { encode_copy_to_vtensor(graph->context(), staging, packed); } -ArithmeticNode::ArithmeticNode( - const ValueRef in1, - const ValueRef in2, - const ValueRef out, - const float alpha, - const arithmetic::OpType optype) - : ExecuteNode({in1, in2}, {out}), alpha_(alpha), optype_(optype) {} - -void ArithmeticNode::encode(ComputeGraph* graph) const { - vTensor& in1 = graph->get_val(inputs_[0]).toTensor(); - vTensor& in2 = graph->get_val(inputs_[1]).toTensor(); - vTensor& out = graph->get_val(outputs_[0]).toTensor(); - - api::ShaderInfo kernel = arithmetic::get_shader(optype_); - arithmetic::record_op(graph->context(), kernel, in1, in2, out, alpha_); -} - } // namespace vulkan } // namespace native } // namespace at diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h index 767517043b..82e2aa2cdf 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h +++ b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h @@ -32,7 +32,7 @@ ValueRef add_arithmetic_node( const ValueRef in1, const ValueRef in2, const float alpha, - const arithmetic::OpType optype, + const api::ShaderInfo& shader, const int64_t shared_object_idx = -1); void add_arithmetic_node( @@ -41,29 +41,20 @@ void add_arithmetic_node( const ValueRef in2, const ValueRef out, const float alpha, - const arithmetic::OpType optype); + const api::ShaderInfo& shader); -class ArithmeticPrepack : public virtual PrepackNode { - public: - explicit ArithmeticPrepack(const ValueRef tref, const ValueRef packed); - - void encode(ComputeGraph* graph) const override; +struct ArithmeticParams final { + api::utils::ivec4 outputSizes; + api::utils::ivec4 input1Sizes; + api::utils::ivec4 input2Sizes; + float alpha; }; -class ArithmeticNode : public virtual ExecuteNode { +class ArithmeticPrepack : public virtual PrepackNode { public: - explicit ArithmeticNode( - const ValueRef in1, - const ValueRef in2, - const ValueRef out, - const float alpha, - const arithmetic::OpType optype); + explicit ArithmeticPrepack(const ValueRef tref, const ValueRef packed); void encode(ComputeGraph* graph) const override; - - private: - float alpha_; - arithmetic::OpType optype_; }; } // namespace vulkan diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index 459d5edf1b..5b16780777 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -100,7 +100,7 @@ void encode_copy_from_vtensor( StagingNode::StagingNode(ValueRef from, ValueRef to) : ExecuteNode(from, to) {} -void StagingNode::encode(ComputeGraph* graph) const { +void StagingNode::encode(ComputeGraph* graph) { Value& in_val = graph->get_val(inputs_[0]); Value& out_val = graph->get_val(outputs_[0]); diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.h b/backends/vulkan/runtime/graph/ops/impl/Staging.h index bb9671d4e9..be57a9817f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.h +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.h @@ -88,7 +88,7 @@ class StagingNode : public virtual ExecuteNode { public: explicit StagingNode(ValueRef from, ValueRef to); - void encode(ComputeGraph* graph) const override; + void encode(ComputeGraph* graph) override; }; } // namespace vulkan diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index b36e2352eb..0692d8c709 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -10,7 +10,7 @@ #include -#include +#include #include #include @@ -431,8 +431,7 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { IOValueRef out = {}; - out.value = add_arithmetic_node( - graph, a.value, b.value, 1.0, arithmetic::OpType::ADD); + out.value = add_arithmetic_node(graph, a.value, b.value, 1.0, VK_KERNEL(add)); out.staging = graph.set_output_tensor(out.value); @@ -478,9 +477,8 @@ TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { IOValueRef a = graph.add_input_tensor(size_big, api::kFloat); - ValueRef c = - add_arithmetic_node(graph, a.value, w1, 1.0, arithmetic::OpType::ADD); - ValueRef e = add_arithmetic_node(graph, c, w2, 1.0, arithmetic::OpType::MUL); + ValueRef c = add_arithmetic_node(graph, a.value, w1, 1.0, VK_KERNEL(add)); + ValueRef e = add_arithmetic_node(graph, c, w2, 1.0, VK_KERNEL(mul)); IOValueRef out = {}; out.value = e; @@ -528,7 +526,8 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects) { api::kFloat, /*shared_object_idx = */ 4); - // Allocation count will be 2 (1 staging buffer for each input tensor) + // Allocation count will be 2: + // 1 staging buffer for each input tensor EXPECT_TRUE(get_vma_allocation_count() == 2); ValueRef c = add_arithmetic_node( @@ -536,7 +535,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects) { a.value, b.value, 1.0, - arithmetic::OpType::ADD, + VK_KERNEL(add), /*shared_object_idx = */ 6); IOValueRef d = graph.add_input_tensor( @@ -544,29 +543,33 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects) { api::kFloat, /*shared_object_idx = */ 2); - // Allocation count will be 3 (1 staging buffer for each input tensor) - EXPECT_TRUE(get_vma_allocation_count() == 3); + // Allocation count will be 4, two are new: + // 1 uniform buffer for arithmetic shader params + // 1 staging buffer for the input tensor + EXPECT_TRUE(get_vma_allocation_count() == 4); ValueRef e = add_arithmetic_node( graph, c, d.value, 1.0, - arithmetic::OpType::MUL, + VK_KERNEL(mul), /*shared_object_idx = */ 4); IOValueRef out = {}; out.value = e; out.staging = graph.set_output_tensor(out.value); - // Allocation count will be 4 (1 staging buffer for each I/O tensor) - EXPECT_TRUE(get_vma_allocation_count() == 4); + // Allocation count will be 6, three are new: + // 1 uniform buffer for arithmetic shader params + // 1 staging buffer for the input tensor + EXPECT_TRUE(get_vma_allocation_count() == 6); graph.encode_execute(); // Allocation count will be 13: // 4 staging buffers for each I/O tensor - // 6 uniform buffers to store args for each shader dispatch + // 6 uniform buffers to store params for each shader dispatch // 3 shared objects to back tensor memory EXPECT_TRUE(get_vma_allocation_count() == 13);