From 7b96a5b43e667323f627a873f33d35d6fea6245c Mon Sep 17 00:00:00 2001
From: estebanpadilla
Date: Mon, 29 Jul 2024 08:04:06 -0700
Subject: [PATCH] Add 3D Texture Bandwidth metric (#4336)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4336

This diff introduces a profiler that obtains the maximum and minimum bandwidth
for reading unique addresses from 3D textures in each of its dimensions, using
the following shader, where A is a 3D texture and B is a write-only buffer.

The calculation of the texel position depends on the dimension being
benchmarked:

x: pos = ivec3(offset, 0, 0)
y: pos = ivec3(0, offset, 0)
z: pos = ivec3(0, 0, offset)

void main() {
  vec4 sum = vec4(0);
  const uint workgroup_width = local_group_size * niter * ${NUNROLL};
  uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask;

  int i = 0;
  for (; i < niter; ++i) {
    sum *= texelFetch(A, pos, 0);
    offset = (offset + local_group_size) & addr_mask;
    ...
    ...
    sum *= texelFetch(A, pos, 0);
    offset = (offset + local_group_size) & addr_mask;
  }

  vec4 zero = vec4(i>>31);
  B[gl_LocalInvocationID[0]] = sum + zero;
}

The address mask allows us to control how many unique addresses we are
accessing. If the number of unique vectors we want to read is 3, the offset
will jump between three unique addresses throughout the iterations, giving us
the bandwidth for that specific size of data.

If the size of the unique data read is larger than the work group size, then
each run will have its own block of data to read, defined by the initial
offset calculation, where the offset is obtained from the workgroup ID and the
local invocation ID.

Finally, we make sure to use the `sum` and `i` variables so that the
compiler's optimizer does not flatten the loops.

For a Samsung S22, the bandwidth behaves like this for each of the dimensions.

{F1767497386}

Comparing the bandwidth for the X dimension to OpenCL, we can observe that,
although the behavior is the same, Vulkan has an increased bandwidth for most
access sizes.

{F1767497972}

Comparing to buffer bandwidth, we can observe that texture bandwidth is
similar to that of regular buffers, but still much smaller than that of UBOs
at small access sizes.

{F1767497707}
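As a standalone illustration of the masking arithmetic described above (a
host-side C++ sketch, not part of this patch's sources; the nvec and stride
values below are made up), the update
`offset = (offset + local_group_size) & addr_mask` keeps cycling over exactly
`nvec` unique addresses whenever `nvec` is a power of two:

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t nvec = 8;              // unique texel addresses to touch (power of two)
  const uint32_t addr_mask = nvec - 1;  // "& addr_mask" behaves like "% nvec"
  const uint32_t local_group_size = 3;  // hypothetical stride between accesses

  uint32_t offset = 0;
  for (int i = 0; i < 16; ++i) {
    std::printf("access %2d -> texel %u\n", i, offset);
    offset = (offset + local_group_size) & addr_mask;  // same update as the shader
  }
  return 0;
}

Running this visits texels 0, 3, 6, 1, 4, 7, 2, 5 and then wraps, so every
one of the nvec addresses is exercised regardless of the stride.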
Differential Revision: https://internalfb.com/D59980139
---
 .../tools/gpuinfo/glsl/tex_bandwidth.glsl |  59 +++++++++
 .../tools/gpuinfo/glsl/tex_bandwidth.yaml |  15 +++
 backends/vulkan/tools/gpuinfo/src/app.cpp | 112 ++++++++++++++++++
 3 files changed, 186 insertions(+)
 create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl
 create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml

diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl
new file mode 100644
index 00000000000..d848fc04754
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define VEC4_T ${texel_type(DTYPE)}
+
+layout(std430) buffer;
+
+${layout_declare_sampler(0, "r", "A", DTYPE)}
+${layout_declare_buffer(1, "w", "B", DTYPE, "PRECISION", False)}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int niter = 1;
+layout(constant_id = 4) const int nvec = 1;
+layout(constant_id = 5) const int local_group_size = 1;
+
+void main() {
+  // The address mask works as a modulo because x % 2^n == x & (2^n - 1).
+  // This will help us limit address accessing to a specific set of unique
+  // addresses depending on the access size we want to measure.
+  const int addr_mask = nvec - 1;
+  vec4 sum = vec4(0);
+
+  // This is to distribute the accesses to unique addresses across the workgroups, once the
+  // size of the access exceeds the workgroup width.
+  const uint workgroup_width = local_group_size * niter * ${NUNROLL};
+  uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask;
+
+  int i = 0;
+  for (; i < niter; ++i){
+    VEC4_T in_texel;
+    $for j in range(int(NUNROLL)):
+      $if DIM == 0:
+        in_texel = texelFetch(A, ivec3(offset, 0, 0), 0);
+      $elif DIM == 1:
+        in_texel = texelFetch(A, ivec3(0, offset, 0), 0);
+      $elif DIM == 2:
+        in_texel = texelFetch(A, ivec3(0, 0, offset), 0);
+
+      sum *= in_texel;
+
+      // On each unroll, a new unique address will be accessed through the offset,
+      // limited by the address mask to a specific set of unique addresses
+      offset = (offset + local_group_size) & addr_mask;
+  }
+
+  // This is to ensure no compiler optimizations occur
+  vec4 zero = vec4(i>>31);
+
+  B[gl_LocalInvocationID[0]] = sum + zero;
+}
diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml
new file mode 100644
index 00000000000..84da6938fd4
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml
@@ -0,0 +1,15 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+tex_bandwidth:
+  parameter_names_with_default_values:
+    DTYPE: float
+    NUNROLL: "16"
+  generate_variant_forall:
+    DIM:
+      - RANGE: [0, 2]
+  shader_variants:
+    - NAME: tex_bandwidth
diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp
index 8facdb51601..92eef840687 100644
--- a/backends/vulkan/tools/gpuinfo/src/app.cpp
+++ b/backends/vulkan/tools/gpuinfo/src/app.cpp
@@ -22,6 +22,9 @@ class App {
   uint32_t sm_count_;
   uint32_t nthread_logic_;
   uint32_t subgroup_size_;
+  uint32_t max_tex_width_;
+  uint32_t max_tex_height_;
+  uint32_t max_tex_depth_;
 
  public:
   App() {
@@ -36,6 +39,9 @@ class App {
     nthread_logic_ = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
     buf_cache_size_ = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();
     max_shared_mem_size_ = cl_device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
+    max_tex_width_ = cl_device.getInfo<CL_DEVICE_IMAGE3D_MAX_WIDTH>();
+    max_tex_height_ = cl_device.getInfo<CL_DEVICE_IMAGE3D_MAX_HEIGHT>();
+    max_tex_depth_ = cl_device.getInfo<CL_DEVICE_IMAGE3D_MAX_DEPTH>();
 
     VkPhysicalDeviceSubgroupProperties subgroup_props{};
     VkPhysicalDeviceProperties2 props2{};
@@ -54,6 +60,9 @@ class App {
     std::cout << "Cache Size," << buf_cache_size_ << std::endl;
     std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl;
     std::cout << "SubGroup Size," << subgroup_size_ << std::endl;
+    std::cout << "MaxTexWidth," << max_tex_width_ << std::endl;
+    std::cout << "MaxTexHeight," << max_tex_height_ << std::endl;
+    std::cout << "MaxTexDepth," << max_tex_depth_ << std::endl;
   }
 
   void reg_count() {
@@ -308,6 +317,15 @@ class App {
         << std::endl;
   }
 
+  std::vector<int64_t> _whd_to_nchw(std::vector<int64_t> sizes) {
+    const int64_t W = sizes[0];
+    const int64_t H = sizes[1];
+    const int64_t D = sizes[2];
+
+    // Channels-packed: {W, H, D} = {W, H, (C / 4) * N}
+    return {1, D * 4, H, W};
+  }
+
  public:
   void buf_bandwidth() {
     std::cout << "\n------ Memory Bandwidth ------" << std::endl;
@@ -323,12 +341,105 @@ class App {
     const uint32_t RANGE = 128 * 1024 * 1024;
     _bandwidth("UBO", RANGE);
   }
+
   void shared_mem_bandwidth() {
     std::cout << "\n------ Shared Bandwidth ------" << std::endl;
     const uint32_t RANGE = max_shared_mem_size_;
     _bandwidth("Shared", RANGE);
   }
 
+  void tex_bandwidth() {
+    for (int dim = 0; dim < 3; dim++) {
+      std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------"
+                << std::endl;
+      const uint32_t MAX_SIZE = dim == 0 ? max_tex_width_
+          : dim == 1                     ? max_tex_height_
+                                         : max_tex_depth_;
+
+      // rgba, float
+      const uint32_t VEC_WIDTH = 4;
+      const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
+      const uint32_t NVEC = MAX_SIZE;
+
+      const uint32_t RANGE = NVEC * VEC_SIZE;
+
+      // Cache lines flushed
+      const uint32_t NFLUSH = 4;
+      // Number of loop unrolls. Changing this value requires an equal change in
+      // tex_bandwidth.yaml
+      const uint32_t NUNROLL = 16;
+      // Number of iterations. Increasing this value reduces noise in exchange
+      // for higher latency.
+      const uint32_t NITER = 10;
+      // Number of memory reads per thread
+      const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
+      // Number of threads needed to read all texels
+      const uint32_t NTHREAD = NVEC;
+      // Occupy all threads
+      const uint32_t local_x = nthread_logic_;
+      // Ensure that global is a multiple of local, and distribute across all
+      // SMs
+      const uint32_t global_x =
+          (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH;
+
+      auto shader_name = "tex_bandwidth_" + std::to_string(dim);
+
+      std::vector<int64_t> sizes_whd = {MAX_SIZE, 1, 1};
+      if (dim == 1) {
+        sizes_whd = {1, MAX_SIZE, 1};
+      } else if (dim == 2) {
+        sizes_whd = {1, 1, MAX_SIZE};
+      }
+      auto sizes_nchw = _whd_to_nchw(sizes_whd);
+
+      vTensor in_tensor =
+          api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);
+
+      auto bench = [&](uint32_t access_size, uint32_t dim) {
+        // Number of texels that fit in this iteration
+        const uint32_t ntexel_access = access_size / VEC_SIZE;
+
+        StorageBuffer out_buf(
+            context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_);
+        vkapi::PipelineBarrier pipeline_barrier{};
+
+        auto time = benchmark_on_gpu(shader_name, 10, [&]() {
+          context()->submit_compute_job(
+              VK_KERNEL_FROM_STR(shader_name),
+              pipeline_barrier,
+              {global_x, 1, 1},
+              {local_x, 1, 1},
+              {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)},
+              VK_NULL_HANDLE,
+              0,
+              in_tensor.image(),
+              out_buf.buffer());
+        });
+
+        const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
+        double gbps = SIZE_TRANS * 1e-3 / time;
+        std::cout << "Texture bandwidth accessing \t" << access_size
+                  << "\tB unique data is \t" << gbps << " \tgbps (\t" << time
+                  << "\tus)" << std::endl;
+        return gbps;
+      };
+
+      double max_bandwidth = 0;
+      double min_bandwidth = DBL_MAX;
+      for (uint32_t access_size = VEC_SIZE; access_size < RANGE;
+           access_size *= 2) {
+        double gbps = bench(access_size, dim);
+        max_bandwidth = std::max(gbps, max_bandwidth);
+        min_bandwidth = std::min(gbps, min_bandwidth);
+      }
+
+      std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth
+                << std::endl;
+      std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth
+                << std::endl;
+    }
+  }
+
   // Warp size is a difficult metric to obtain because the hardware limitations
   // do not always coincide with the way the SM divides the workload. For
   // instance, the hardware can have a warp size of 64 threads, but an SM might
@@ -492,6 +603,7 @@ int main(int argc, const char** argv) {
   app.ubo_bandwidth();
   app.shared_mem_bandwidth();
   app.warp_size();
+  app.tex_bandwidth();
 
   return 0;
 }
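
For reference, a minimal standalone sketch of the bandwidth arithmetic used in
the bench lambda above (the dispatch width and GPU time below are hypothetical
placeholders, not measured values): each thread performs NUNROLL * NITER texel
fetches of 16 bytes each, and bytes per microsecond scaled by 1e-3 yields GB/s.

#include <cstdint>
#include <cstdio>

int main() {
  // Quantities mirroring tex_bandwidth(): reads per thread and texel size.
  const uint32_t VEC_SIZE = 4 * sizeof(float);  // one rgba float texel = 16 B
  const uint32_t NUNROLL = 16;
  const uint32_t NITER = 10;
  const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
  const uint32_t global_x = 1024;  // hypothetical dispatch width
  const double time_us = 50.0;     // hypothetical GPU time in microseconds

  // Bytes moved; 1 byte/us = 1e6 B/s = 1e-3 GB/s, hence the 1e-3 factor.
  const double size_trans = double(global_x) * NREAD_PER_THREAD * VEC_SIZE;
  const double gbps = size_trans * 1e-3 / time_us;
  std::printf("%.2f GB/s\n", gbps);
  return 0;
}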