diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h
index e69a4937e5..b1a02a6d2e 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.h
+++ b/backends/vulkan/runtime/api/containers/Tensor.h
@@ -277,6 +277,14 @@ class vTensor final {
     return sizes_.size();
   }
 
+  inline const std::vector<int64_t>& strides() const {
+    return strides_;
+  }
+
+  inline const std::vector<int64_t>& unsqueezed_strides() const {
+    return unsqueezed_strides_;
+  }
+
   /*
    * Returns a GPU buffer containing the sizes of the tensor in WHCN order.
    * Note that dimensions that are not present in the tensor's sizes are set to
diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.glsl
index fe69501f9c..9d4b18f0d1 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.glsl
@@ -1,4 +1,3 @@
-
 #version 450 core
 
 #define PRECISION ${PRECISION}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl
new file mode 100644
index 0000000000..58796879e8
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl
@@ -0,0 +1,35 @@
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define T ${buffer_scalar_type(DTYPE)}
+
+#include "indexing_utils.h"
+
+${define_required_extensions(DTYPE)}
+
+layout(std430) buffer;
+
+${layout_declare_tensor(0, "w", "nchw_buf", DTYPE, STORAGE)}
+${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
+${layout_declare_ubo(2, "ivec4", "in_sizes")}
+${layout_declare_ubo(3, "ivec4", "in_strides")}
+${layout_declare_ubo(4, "int", "numel")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// This constant is unused in this shader but is kept so that the signature is
+// consistent with image_to_nchw.
+layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM;
+
+void main() {
+  int out_id = int(gl_GlobalInvocationID.x);
+  if (out_id >= numel) {
+    return;
+  }
+
+  ivec4 t_in_idx = from_nchw_buffer_i(out_id, in_sizes);
+  const int in_id = to_buffer_id(t_in_idx, in_strides);
+
+  nchw_buf[out_id] = t_in[in_id];
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml
new file mode 100644
index 0000000000..653bda9ccc
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml
@@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+buffer_to_nchw:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: buffer
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+      - VALUE: int
+      - VALUE: int8
+  shader_variants:
+    - NAME: buffer_to_nchw
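The new buffer_to_nchw shader drops the packed-dim specialization constant and derives everything from the tensor's WHCN sizes and strides. The index arithmetic is easy to check on the CPU; the standalone C++ sketch below mirrors from_nchw_buffer_i() followed by to_buffer_id() for a single staging element. The sizes, strides, and element chosen are hypothetical (a channel-packed N=1, C=3, H=2, W=4 tensor), not values taken from the runtime.

#include <array>
#include <cstdint>
#include <iostream>

using ivec4 = std::array<int32_t, 4>; // (W, H, C, N) order, as in the shaders

int main() {
  // Channel-packed layout: C moves fastest, so the WHCN strides are
  // (C, C*W, 1, C*W*H) = (3, 12, 1, 24).
  const ivec4 sizes = {4, 2, 3, 1};
  const ivec4 strides = {3, 12, 1, 24};

  // Mirror of from_nchw_buffer_i(): NCHW staging index -> (w, h, c, n).
  const int out_id = 14; // element (n=0, c=1, h=1, w=2) of the staging buffer
  const ivec4 idx = {
      out_id % sizes[0],
      (out_id / sizes[0]) % sizes[1],
      (out_id / (sizes[0] * sizes[1])) % sizes[2],
      out_id / (sizes[0] * sizes[1] * sizes[2])};

  // Mirror of to_buffer_id(): (w, h, c, n) -> strided GPU buffer index.
  const int in_id = idx[0] * strides[0] + idx[1] * strides[1] +
      idx[2] * strides[2] + idx[3] * strides[3];

  std::cout << "nchw_buf[" << out_id << "] = t_in[" << in_id << "]\n"; // 19
  return 0;
}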
diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
index d3264e43a2..21eadff0b3 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
+++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
@@ -41,6 +41,21 @@
  */
 #define alignup4(x) ((x + 3) & -4)
 
+/*
+ * Input: (W, H, C, N) strides of a tensor
+ * Returns: the WHCN index of the fastest moving dimension
+ */
+int find_packed_dim(const ivec4 strides) {
+  int packed_dim = 0;
+  for (int i = 0; i <= 3; i++) {
+    if (strides[i] == 1) {
+      packed_dim = i;
+      break;
+    }
+  }
+  return packed_dim;
+}
+
 //
 // (w, h, c, n) Tensor Index <-> Contiguous Buffer Index Conversion
 //
@@ -74,27 +89,49 @@ ivec4 from_nchw_buffer_i(int buf_i, ivec4 sizes) {
       (buf_i / (sizes.x * sizes.y * sizes.z)));
 }
 
+int to_nchw_buffer_i(const ivec4 tensor_idx, const ivec4 sizes) {
+  return tensor_idx.w * sizes.x * sizes.y * sizes.z +
+      tensor_idx.z * sizes.x * sizes.y + tensor_idx.y * sizes.x + tensor_idx.x;
+}
+
 /*
  * Input: Texel buffer index, (W, H, C, N) strides of a tensor, which dim is
  * packed along a texel
- * Returns: The (x, y, z, n) texel position corresponding to the first element
- * of the texel at the specified buffer index
+ * Returns: The (w, h, c, n) tensor index corresponding to the buffer element
  */
-ivec4 to_tensor_idx(int buf_i, ivec4 strides, int packed_dim) {
+ivec4 to_tensor_idx(int buffer_id, const ivec4 strides, const int packed_dim) {
   ivec4 idx;
   for (int i = 3; i >= 0; i--) {
     if (i != packed_dim) {
-      idx[i] = buf_i / strides[i];
-      buf_i %= strides[i];
+      idx[i] = buffer_id / strides[i];
+      buffer_id %= strides[i];
     }
   }
-  idx[packed_dim] = buf_i;
+  idx[packed_dim] = buffer_id;
   return idx;
 }
 
-int to_texel_idx(const ivec4 texel_pos, ivec4 strides) {
-  return texel_pos.x * strides.x + texel_pos.y * strides.y +
-      texel_pos.z * strides.z + texel_pos.w * strides.w;
+/*
+ * Input: Texel buffer index, (W, H, C, N) strides of a tensor
+ * Returns: The (w, h, c, n) tensor index corresponding to the buffer element
+ *
+ * This is a convenience overload of the above function. If the packed dim is
+ * not known, it can be found by finding the first dimension with a stride of 1.
+ * However, this process adds some overhead, so if performance is a concern then
+ * the above function should be used instead so that the packed dim is provided.
+ */
+ivec4 to_tensor_idx(int buffer_id, const ivec4 strides) {
+  int packed_dim = find_packed_dim(strides);
+  return to_tensor_idx(buffer_id, strides, packed_dim);
+}
+
+/*
+ * Input: (w, h, c, n) tensor index, (W, H, C, N) strides of the tensor buffer
+ * Returns: the buffer index corresponding to the specified tensor index
+ */
+int to_buffer_id(const ivec4 tensor_idx, ivec4 strides) {
+  return tensor_idx.x * strides.x + tensor_idx.y * strides.y +
+      tensor_idx.z * strides.z + tensor_idx.w * strides.w;
 }
 
 //
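Because the new helpers are pure integer arithmetic, they can be exercised outside of GLSL. A minimal C++ mirror is below, with ivec4 modeled as a std::array and a made-up channel-packed stride set; it round-trips every buffer index through to_tensor_idx() and back through to_buffer_id(). This is a sketch for intuition, not part of the runtime.

#include <array>
#include <cstdint>
#include <iostream>

using ivec4 = std::array<int32_t, 4>; // (W, H, C, N) order

// Mirror of find_packed_dim(): first dimension with a stride of 1.
int find_packed_dim(const ivec4& strides) {
  for (int i = 0; i <= 3; i++) {
    if (strides[i] == 1) {
      return i;
    }
  }
  return 0;
}

// Mirror of to_tensor_idx(): peel off whole strides from largest to smallest,
// leaving the remainder as the coordinate of the packed dimension.
ivec4 to_tensor_idx(int buffer_id, const ivec4& strides, const int packed_dim) {
  ivec4 idx{};
  for (int i = 3; i >= 0; i--) {
    if (i != packed_dim) {
      idx[i] = buffer_id / strides[i];
      buffer_id %= strides[i];
    }
  }
  idx[packed_dim] = buffer_id;
  return idx;
}

// Mirror of to_buffer_id(): a dot product of index and strides.
int to_buffer_id(const ivec4& idx, const ivec4& strides) {
  return idx[0] * strides[0] + idx[1] * strides[1] + idx[2] * strides[2] +
      idx[3] * strides[3];
}

int main() {
  // Hypothetical W=4, H=4, C=3, N=1 channel-packed tensor: WHCN strides are
  // (C, C*W, 1, C*W*H) = (3, 12, 1, 48), 48 elements in total.
  const ivec4 strides = {3, 12, 1, 48};
  const int numel = 48;
  const int packed_dim = find_packed_dim(strides); // -> 2, the C dimension

  for (int buf_id = 0; buf_id < numel; buf_id++) {
    const ivec4 idx = to_tensor_idx(buf_id, strides, packed_dim);
    if (to_buffer_id(idx, strides) != buf_id) {
      std::cout << "round-trip failed at " << buf_id << "\n";
      return 1;
    }
  }
  std::cout << "round-trip OK for all " << numel << " elements\n";
  return 0;
}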
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl
new file mode 100644
index 0000000000..d861972f93
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl
@@ -0,0 +1,35 @@
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define T ${buffer_scalar_type(DTYPE)}
+
+#include "indexing_utils.h"
+
+${define_required_extensions(DTYPE)}
+
+layout(std430) buffer;
+
+${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_tensor(1, "r", "nchw_in", DTYPE, STORAGE)}
+${layout_declare_ubo(2, "ivec4", "out_sizes")}
+${layout_declare_ubo(3, "ivec4", "out_strides")}
+${layout_declare_ubo(4, "int", "numel")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// This constant is unused in this shader but is kept so that the signature is
+// consistent with nchw_to_image.
+layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM;
+
+void main() {
+  int out_id = int(gl_GlobalInvocationID.x);
+  if (out_id >= numel) {
+    return;
+  }
+
+  ivec4 out_idx = to_tensor_idx(out_id, out_strides);
+  const int in_id = to_nchw_buffer_i(out_idx, out_sizes);
+
+  t_out[out_id] = nchw_in[in_id];
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml
new file mode 100644
index 0000000000..6292ef9333
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml
@@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+nchw_to_buffer:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: buffer
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+      - VALUE: int
+      - VALUE: int8
+  shader_variants:
+    - NAME: nchw_to_buffer
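nchw_to_buffer runs the transfer in the opposite direction: each invocation takes an element of the strided output buffer, recovers its (w, h, c, n) index with to_tensor_idx(), and gathers from NCHW staging via to_nchw_buffer_i(). Below is a CPU sketch of that loop over a whole, hypothetical channel-packed N=1, C=3, H=2, W=4 tensor; it complements the single-element example above, and indeed t_out[19] picks up staging element 14.

#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

using ivec4 = std::array<int32_t, 4>; // (W, H, C, N) order

int main() {
  const ivec4 sizes = {4, 2, 3, 1};
  const ivec4 strides = {3, 12, 1, 24}; // channel-packed: stride 1 at C
  const int numel = 24;
  const int packed_dim = 2; // what find_packed_dim(strides) would return

  std::vector<float> nchw_in(numel);
  for (int i = 0; i < numel; i++) {
    nchw_in[i] = float(i); // stand-in staging contents
  }

  // One loop iteration per shader invocation of nchw_to_buffer's main().
  std::vector<float> t_out(numel);
  for (int out_id = 0; out_id < numel; out_id++) {
    // to_tensor_idx(): strided buffer index -> (w, h, c, n)
    ivec4 idx{};
    int rem = out_id;
    for (int i = 3; i >= 0; i--) {
      if (i != packed_dim) {
        idx[i] = rem / strides[i];
        rem %= strides[i];
      }
    }
    idx[packed_dim] = rem;
    // to_nchw_buffer_i(): (w, h, c, n) -> NCHW staging index
    const int in_id = idx[3] * sizes[0] * sizes[1] * sizes[2] +
        idx[2] * sizes[0] * sizes[1] + idx[1] * sizes[0] + idx[0];
    t_out[out_id] = nchw_in[in_id];
  }

  std::cout << "t_out[19] = " << t_out[19] << "\n"; // prints 14
  return 0;
}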
diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
index b35d4b0175..b02613c208 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
@@ -26,7 +26,10 @@ void add_staging_to_tensor_node(
 
   vkapi::ParamsBindList ubos;
   if (graph.is_buffer_storage(out_tensor)) {
-    ubos.append(graph.numel_ubo(out_tensor));
+    ubos.append(
+        {graph.sizes_ubo(out_tensor),
+         graph.strides_ubo(out_tensor),
+         graph.numel_ubo(out_tensor)});
   } else {
     ubos.append(graph.sizes_ubo(out_tensor));
   }
@@ -61,7 +64,10 @@ void add_tensor_to_staging_node(
 
   vkapi::ParamsBindList ubos;
   if (graph.is_buffer_storage(in_tensor)) {
-    ubos.append(graph.numel_ubo(in_tensor));
+    ubos.append(
+        {graph.sizes_ubo(in_tensor),
+         graph.strides_ubo(in_tensor),
+         graph.numel_ubo(in_tensor)});
   } else {
     ubos.append(graph.sizes_ubo(in_tensor));
   }
@@ -105,7 +111,7 @@ ValueRef prepack(
 
   vkapi::ParamsBindList ubos;
   if (graph.is_buffer_storage(v)) {
-    ubos.append(graph.numel_ubo(v));
+    ubos.append({graph.sizes_ubo(v), graph.strides_ubo(v), graph.numel_ubo(v)});
   } else {
     ubos.append(graph.sizes_ubo(v));
   }
diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp
index daec2666f8..294e36b9a8 100644
--- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp
+++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp
@@ -107,7 +107,7 @@ vkapi::ShaderInfo get_nchw_to_tensor_shader(
   }
 
   if (v_dst.storage_type() == utils::kBuffer) {
-    kernel_name = "buffer_to_buffer";
+    kernel_name = "nchw_to_buffer";
     add_dtype_suffix(kernel_name, v_dst);
     return VK_KERNEL_FROM_STR(kernel_name);
   }
@@ -131,7 +131,7 @@ vkapi::ShaderInfo get_tensor_to_nchw_shader(
   }
 
   if (v_src.storage_type() == utils::kBuffer) {
-    kernel_name = "buffer_to_buffer";
+    kernel_name = "buffer_to_nchw";
     add_dtype_suffix(kernel_name, v_src);
     return VK_KERNEL_FROM_STR(kernel_name);
   }
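On the dispatch side the shader name is all that changes; add_dtype_suffix() still appends the dtype variant generated from the .yaml files above. The sketch below shows the shape of that selection logic with an illustrative enum and suffix table; the real code goes through vkapi::ShaderInfo and the tensor's actual dtype, so treat the names here as stand-ins rather than the actual API.

#include <iostream>
#include <string>

enum class DType { Half, Float, Int, Int8 }; // illustrative, not vkapi's enum

// Stand-in for add_dtype_suffix(): append the variant suffix for the dtype.
std::string add_dtype_suffix(std::string kernel_name, DType dtype) {
  switch (dtype) {
    case DType::Half:  return kernel_name + "_half";
    case DType::Float: return kernel_name + "_float";
    case DType::Int:   return kernel_name + "_int";
    case DType::Int8:  return kernel_name + "_int8";
  }
  return kernel_name;
}

// Shape of get_nchw_to_tensor_shader()'s buffer branch after this change.
std::string nchw_to_tensor_kernel(bool is_buffer_storage, DType dtype) {
  if (is_buffer_storage) {
    return add_dtype_suffix("nchw_to_buffer", dtype);
  }
  return add_dtype_suffix("nchw_to_image", dtype); // texture path, simplified
}

int main() {
  std::cout << nchw_to_tensor_kernel(true, DType::Half) << "\n"; // nchw_to_buffer_half
  return 0;
}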
diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp
index 29cd7bf995..e6f2863470 100644
--- a/backends/vulkan/test/utils/test_utils.cpp
+++ b/backends/vulkan/test/utils/test_utils.cpp
@@ -23,15 +23,13 @@ void record_nchw_to_buffer_op(
     vkapi::VulkanBuffer& src_buffer,
     api::vTensor& v_dst) {
   vkapi::PipelineBarrier pipeline_barrier{};
-  vkapi::SpecVarList specialization_constants = {
-      SV(v_dst.packed_dim_whcn_idx())};
 
   context->submit_compute_job(
       get_nchw_to_tensor_shader(v_dst),
       pipeline_barrier,
       {uint32_t(v_dst.numel()), 1, 1},
       {64, 1, 1},
-      specialization_constants,
+      {},
       VK_NULL_HANDLE,
       0,
       v_dst.buffer(
@@ -39,6 +37,8 @@
           pipeline_barrier,
           vkapi::PipelineStage::COMPUTE,
           vkapi::MemoryAccessType::WRITE),
       src_buffer,
+      v_dst.sizes_ubo(),
+      v_dst.strides_ubo(),
       v_dst.numel_ubo());
 }
@@ -47,19 +47,18 @@ void record_buffer_to_nchw_op(
     api::vTensor& v_src,
     vkapi::VulkanBuffer& dst_buffer) {
   vkapi::PipelineBarrier pipeline_barrier{};
-  vkapi::SpecVarList specialization_constants = {
-      SV(v_src.packed_dim_whcn_idx())};
-
   context->submit_compute_job(
       get_tensor_to_nchw_shader(v_src),
       pipeline_barrier,
       {uint32_t(v_src.numel()), 1, 1},
       {64, 1, 1},
-      specialization_constants,
+      {},
       VK_NULL_HANDLE,
       0,
       dst_buffer,
       v_src.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
+      v_src.sizes_ubo(),
+      v_src.strides_ubo(),
       v_src.numel_ubo());
 }
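The two test recorders above exercise a staging -> tensor -> staging round trip on the GPU; the invariant they check can be stated on the CPU using nothing but the indexing helpers. A hypothetical harness follows, with made-up channel-packed strides; the scatter formulation is equivalent to the shaders' per-output gathers because the index mapping is a bijection.

#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

using ivec4 = std::array<int32_t, 4>; // (W, H, C, N) order

// Mirror of to_buffer_id(): dot product of index and strides.
int to_buffer_id(const ivec4& idx, const ivec4& strides) {
  return idx[0] * strides[0] + idx[1] * strides[1] + idx[2] * strides[2] +
      idx[3] * strides[3];
}

// Mirror of from_nchw_buffer_i(): NCHW staging index -> (w, h, c, n).
ivec4 from_nchw_buffer_i(int buf_i, const ivec4& sizes) {
  return {
      buf_i % sizes[0],
      (buf_i / sizes[0]) % sizes[1],
      (buf_i / (sizes[0] * sizes[1])) % sizes[2],
      buf_i / (sizes[0] * sizes[1] * sizes[2])};
}

int main() {
  // Hypothetical N=2, C=3, H=4, W=5 tensor, channel-packed: WHCN strides are
  // (C, C*W, 1, C*W*H) = (3, 15, 1, 60), 120 elements in total.
  const ivec4 sizes = {5, 4, 3, 2};
  const ivec4 strides = {3, 15, 1, 60};
  const int numel = 120;

  std::vector<float> staging_in(numel), gpu_buf(numel), staging_out(numel);
  for (int i = 0; i < numel; i++) {
    staging_in[i] = float(i);
  }

  // nchw_to_buffer: scatter staging elements into the strided buffer.
  for (int i = 0; i < numel; i++) {
    gpu_buf[to_buffer_id(from_nchw_buffer_i(i, sizes), strides)] = staging_in[i];
  }
  // buffer_to_nchw: gather the strided buffer back into staging.
  for (int i = 0; i < numel; i++) {
    staging_out[i] = gpu_buf[to_buffer_id(from_nchw_buffer_i(i, sizes), strides)];
  }

  const bool ok = (staging_in == staging_out);
  std::cout << (ok ? "round-trip OK" : "round-trip FAILED") << "\n";
  return ok ? 0 : 1;
}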