Bring back extents_ubo() as texture_limits_ubo() (#3217)

Summary: Pull Request resolved: #3217 ## Context #3181 deprecated the `gpu_sizes_ubo()` and `extents_ubo()` functions of `vTensor` in order to standardize how shaders consume shape/size metadata of input tensors. However, this came at the cost of increasing the overhead required for bounds checking, which is needed to support dynamic shapes, as shaders now needed to convert the input sizes to texture limits before checking if a given texel position is out of bounds. Benchmarking revealed that this overhead can be quite significant, especially on lower-power mobile GPUs. In the interest of preserving performance, `extents_ubo()` is re-introduced since bounds checking is an operation that is common to every single shader. However, some improvements are made: * instead of `extents`, the nomenclature `texture_limits` is used in order to differentiate from physical image extents of the texture. * `texture_limits` is represented via an `ivec3` (previously `uvec4`); this means that to use it for bounds checking, there does not need to be an implicit cast from `uvec` to `ivec` and there is also no need for swizzling. Also introduced in this changeset is the convention of passing both the texture limits and tensor sizes instead of using `pos_out_of_bounds()`. Passing in the texture limits is probably cheaper than using `pos_out_of_bounds()`. There are some exceptions, though, where I chose not to migrate to this pattern to avoid passing in too many variants of tensor metadata. ### What about `gpu_sizes_ubo`? I will hold off on re-introducing `gpu_sizes_ubo` for now since converting `sizes` to `gpu_sizes` is much cheaper compared to `pos_out_of_bounds()`: ``` ivec4 sizes[packed_dim] = alignup4(sizes[packed_dim]) ``` Will perform some additional benchmarking on this to see if the overhead of the alignment warrants an explicit API for passing in GPU sizes to shaders. 
ghstack-source-id: 223453651 exported-using-ghexport Reviewed By: yipjustin, jorgep31415 Differential Revision: D56435574 fbshipit-source-id: 656f79eecbfc7c77cbe067df6c9ea54c51c50633
pytorch · Apr 22, 2024 · 9769386 · 9769386
1 parent dbf90c2
commit 9769386
Show file tree

Hide file tree

Showing 31 changed files with 202 additions and 135 deletions.
diff --git a/backends/vulkan/runtime/api/Tensor.cpp b/backends/vulkan/runtime/api/Tensor.cpp
@@ -139,8 +139,10 @@ vTensor::vTensor(
       // Calculate sizes and strides
       sizes_(sizes.begin(), sizes.end()),
       gpu_sizes_{calc_gpu_sizes(sizes, memory_layout_, storage_type)},
-      // Utility Uniform Buffer that can be passed to shaders as arguments
-      sizes_uniform_(context, api::utils::make_whcn_ivec4(sizes_)),
+      texture_limits_{{0, 0, 0}},
+      // Utility Uniform Buffers that can be passed to shaders as arguments
+      sizes_uniform_(),
+      texture_limits_uniform_(),
       // Construct Tensor storage
       storage_(
           context,
@@ -149,6 +151,13 @@ vTensor::vTensor(
           gpu_sizes_,
           dtype_,
           allocate_memory) {
+  if (storage_type != api::kBuffer) {
+    texture_limits_.limits = api::utils::ivec3{
+        api::utils::safe_downcast<int32_t>(storage_.extents_.data[0]),
+        api::utils::safe_downcast<int32_t>(storage_.extents_.data[1]),
+        api::utils::safe_downcast<int32_t>(storage_.extents_.data[2])};
+  }
+
   if (dtype == api::kHalf) {
     VK_CHECK_COND(
         api::context()->adapter_ptr()->has_16bit_storage(),
@@ -187,6 +196,22 @@ api::VulkanBuffer& vTensor::buffer(
   return storage_.buffer_;
 }
 
+const api::BufferBindInfo vTensor::sizes_ubo() {
+  if (!sizes_uniform_.buffer()) {
+    sizes_uniform_ = api::UniformParamsBuffer(
+        storage_.context_, api::utils::make_whcn_ivec4(sizes_));
+  }
+  return api::BufferBindInfo(sizes_uniform_.buffer());
+}
+
+const api::BufferBindInfo vTensor::texture_limits_ubo() {
+  if (!texture_limits_uniform_.buffer()) {
+    texture_limits_uniform_ =
+        api::UniformParamsBuffer(storage_.context_, texture_limits_);
+  }
+  return api::BufferBindInfo(texture_limits_uniform_.buffer());
+}
+
 VmaAllocationCreateInfo vTensor::get_allocation_create_info() const {
   switch (storage_type()) {
     case api::kBuffer:
@@ -224,7 +249,25 @@ void vTensor::bind_allocation(const api::MemoryAllocation& allocation) {
 void vTensor::update_size_metadata(const std::vector<int64_t>& new_sizes) {
   sizes_ = new_sizes;
   gpu_sizes_ = calc_gpu_sizes(sizes_, memory_layout_, storage_type());
-  sizes_uniform_.update(api::utils::make_whcn_ivec4(sizes_));
+
+  if (storage_type() != api::kBuffer) {
+    // Calculate the extents of the image texture that would have been required
+    // for a tensor of the new sizes.
+    api::utils::uvec3 virtual_extents =
+        create_image_extents(gpu_sizes_, storage_type(), memory_layout_);
+    // Update the texture limits to reflect the new virtual extents.
+    texture_limits_.limits = api::utils::ivec3{
+        api::utils::safe_downcast<int32_t>(virtual_extents.data[0]),
+        api::utils::safe_downcast<int32_t>(virtual_extents.data[1]),
+        api::utils::safe_downcast<int32_t>(virtual_extents.data[2])};
+  }
+
+  if (sizes_uniform_.buffer()) {
+    sizes_uniform_.update(api::utils::make_whcn_ivec4(sizes_));
+  }
+  if (texture_limits_uniform_.buffer()) {
+    texture_limits_uniform_.update(texture_limits_);
+  }
 }
 
 void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
@@ -236,6 +279,8 @@ void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
 }
 
 void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
+  // For texture storage check that the current texture is large enough for the
+  // new sizes of the tensor.
   if (storage_type() != api::kBuffer) {
     api::utils::uvec3 virtual_extents =
         create_image_extents(gpu_sizes_, storage_type(), memory_layout_);

diff --git a/backends/vulkan/runtime/api/Tensor.h b/backends/vulkan/runtime/api/Tensor.h
@@ -94,6 +94,13 @@ class vTensorStorage final {
 };
 
 class vTensor final {
+  struct TextureLimits {
+    // Alignment is required to conform with Vulkan specification; a 3 or 4
+    // component vector with components of size N must have base alignment of
+    // 4N.
+    alignas(16) api::utils::ivec3 limits;
+  };
+
  public:
   explicit vTensor(
       api::Context* context,
@@ -115,11 +122,18 @@ class vTensor final {
 
   std::vector<int64_t> sizes_;
   std::vector<int64_t> gpu_sizes_;
+  TextureLimits texture_limits_;
 
-  // A Vulkan uniform buffer containing the tensor sizes in WHCN that can be
-  // passed into a shader.
+  // A Vulkan uniform buffer containing the (W, H, C, N) tensor sizes that can
+  // be passed into a shader.
   api::UniformParamsBuffer sizes_uniform_;
 
+  // A Vulkan uniform buffer containing the texture limits derived from the
+  // tensor's current size information that can be passed into a shader. Note
+  // that the texture limits may be different from the texture's extents if the
+  // tensor has been resized with `virtual_resize()`.
+  api::UniformParamsBuffer texture_limits_uniform_;
+
   vTensorStorage storage_;
 
  public:
@@ -194,11 +208,17 @@ class vTensor final {
 
   /*
    * Get the binding information for the uniform buffer object containing the
-   * tensor sizes to use in a compute shader.
+   * tensor sizes to use in a compute shader. Note that the GPU buffer will be
+   * allocated the first time this function is called.
    */
-  inline const api::BufferBindInfo sizes_ubo() {
-    return api::BufferBindInfo(sizes_uniform_.buffer());
-  }
+  const api::BufferBindInfo sizes_ubo();
+
+  /*
+   * Get the binding information for the uniform buffer object containing the
+   * texture limits to use in a compute shader. Note that the GPU buffer will be
+   * allocated the first time this function is called.
+   */
+  const api::BufferBindInfo texture_limits_ubo();
 
   inline size_t numel() const {
     return api::utils::multiply_integers(sizes());

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl
@@ -21,8 +21,8 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
 layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
 
-layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
 };
 
 layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
@@ -44,16 +44,14 @@ layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 /*
  * Computes a 2D convolution. Each shader invocation calculates the output at
  * a single output location.
  */
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
@@ -21,8 +21,8 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
 layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
 
-layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
 };
 
 layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
@@ -44,16 +44,14 @@ layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 /*
  * Computes a depthwise convolution. Each shader invocation calculates the
  * output at a single output location.
  */
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -21,8 +21,8 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
 layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
 
-layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
 };
 
 layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
@@ -44,16 +44,14 @@ layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 /*
  * Computes a depthwise convolution. Each shader invocation calculates the
  * output at a single output location.
  */
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
@@ -21,8 +21,8 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
 layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
 
-layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
 };
 
 layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
@@ -44,8 +44,6 @@ layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 /*
  * Computes a 2D pointwise convolution of an NxN output tile. Calculating an
  * output tile for pointwise convolution is more efficient because the kernel
@@ -71,7 +69,7 @@ void main() {
 
   // If the top left position is out of bounds, then this invocation will have
   // no work to do.
-  if (pos_out_of_bounds(pos[0], out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos[0], out_limits))) {
     return;
   }
 
@@ -146,7 +144,7 @@ void main() {
   }
 
   for (int i = 0; i < ${TILE_SIZE * TILE_SIZE}; ++i) {
-    if (!pos_out_of_bounds(pos[i], out_sizes, packed_dim)) {
+    if (all(lessThan(pos[i], out_limits))) {
       imageStore(image_out, pos[i], sum[i]);
     }
   }

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl
@@ -21,11 +21,11 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
 layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
 
-layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
 };
 
-layout(set = 0, binding = 5) uniform PRECISION restrict InExtents {
+layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
   ivec4 in_sizes;
 };
 
@@ -54,7 +54,7 @@ layout(constant_id = 3) const int packed_dim = C_DIM;
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 

diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul.glsl
@@ -16,8 +16,8 @@ layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict
 layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1;
 layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2;
 
-layout(set = 0, binding = 3) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
 };
 
 layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
@@ -26,12 +26,10 @@ layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int out_packed_dim = C_DIM;
-
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, out_packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 

diff --git a/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl
@@ -19,8 +19,8 @@ layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict
 layout(set = 0, binding = 1, ${IMAGE_FORMAT["int"]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM]["int"]} image_idx;
 layout(set = 0, binding = 2) uniform PRECISION sampler3D image_in;
 
-layout(set = 0, binding = 3) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
 };
 
 layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
@@ -36,12 +36,10 @@ layout(set = 0, binding = 5) uniform PRECISION restrict Params {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 

diff --git a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl
@@ -25,22 +25,24 @@ layout(set = 0, binding = 3) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 4) uniform PRECISION sampler3D weight_in;
 layout(set = 0, binding = 5) uniform PRECISION sampler3D bias_in;
 
-layout(set = 0, binding = 6) uniform PRECISION restrict Sizes {
+layout(set = 0, binding = 6) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
+};
+
+layout(set = 0, binding = 7) uniform PRECISION restrict Sizes {
   ivec4 sizes;
 };
 
-layout(set = 0, binding = 7) uniform PRECISION restrict Epsilon {
+layout(set = 0, binding = 8) uniform PRECISION restrict Epsilon {
   float epsilon;
 };
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }