From 32d485634d59214d4202a634b3e927a29cee11aa Mon Sep 17 00:00:00 2001
From: Esteban Padilla Cerdio <estebanpadilla@meta.com>
Date: Tue, 30 Jul 2024 10:44:32 -0700
Subject: [PATCH] Move calculations away from GPU in Bandwidth profilers
 (#4445)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4445

This makes bandwidth profiling more accurate by moving calculations that do not need to run on the GPU out of the shaders and onto the host, leaving only the read operations behind.

Reviewed By: copyrightly

Differential Revision: D60396870
---
 .../vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl  | 13 +++++--------
 .../vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl  | 13 +++++--------
 backends/vulkan/tools/gpuinfo/include/buffers.h   | 15 ++++++++++++++-
 backends/vulkan/tools/gpuinfo/include/textures.h  | 15 ++++++++++++++-
 4 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl b/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl
index c16ad5d14ba..38c9befec6f 100644
--- a/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl
+++ b/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl
@@ -26,6 +26,11 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 layout(constant_id = 3) const int niter = 1;
 layout(constant_id = 4) const int nvec = 1;
 layout(constant_id = 5) const int local_group_size = 1;
+// The address mask works as a modulo because x % 2^n == x & (2^n - 1).
+// This will help us limit address accessing to a specific set of unique
+// addresses depending on the access size we want to measure.
+layout(constant_id = 6) const int addr_mask = 1;
+layout(constant_id = 7) const int workgroup_width = 1;
 
 $if MEMTYPE == "shared":
     shared vec4 A[nvec];
@@ -36,15 +41,7 @@ void main() {
         A[gl_LocalInvocationID[0]][0] = gl_LocalInvocationID[0];
         memoryBarrierShared();
 
-    // The address mask works as a modulo because x % 2^n == x & (2^n - 1).
-    // This will help us limit address accessing to a specific set of unique
-    // addresses depending on the access size we want to measure.
-    const int addr_mask = nvec - 1;
     vec4 sum = vec4(0);
-
-    // This is to distribute the accesses to unique addresses across the workgroups, once the
-    // size of the access excedes the workgroup width.
-    const uint workgroup_width = local_group_size * niter * ${NUNROLL};
     uint offset = (gl_WorkGroupID[0] * workgroup_width  + gl_LocalInvocationID[0]) & addr_mask;
 
     int i = 0;
diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl
index d848fc04754..7ab67bd2d0a 100644
--- a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl
+++ b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl
@@ -21,17 +21,14 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 layout(constant_id = 3) const int niter = 1;
 layout(constant_id = 4) const int nvec = 1;
 layout(constant_id = 5) const int local_group_size = 1;
+// The address mask works as a modulo because x % 2^n == x & (2^n - 1).
+// This will help us limit address accessing to a specific set of unique
+// addresses depending on the access size we want to measure.
+layout(constant_id = 6) const int addr_mask = 1;
+layout(constant_id = 7) const int workgroup_width = 1;
 
 void main() {
-    // The address mask works as a modulo because x % 2^n == x & (2^n - 1).
-    // This will help us limit address accessing to a specific set of unique
-    // addresses depending on the access size we want to measure.
-    const int addr_mask = nvec - 1;
     vec4 sum = vec4(0);
-
-    // This is to distribute the accesses to unique addresses across the workgroups, once the
-    // size of the access excedes the workgroup width.
-    const uint workgroup_width = local_group_size * niter * ${NUNROLL};
     uint offset = (gl_WorkGroupID[0] * workgroup_width  + gl_LocalInvocationID[0]) & addr_mask;
 
     int i = 0;
diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h
index 8cb0da49ca8..c8cf93c4a12 100644
--- a/backends/vulkan/tools/gpuinfo/include/buffers.h
+++ b/backends/vulkan/tools/gpuinfo/include/buffers.h
@@ -123,6 +123,15 @@ void _bandwidth(
     // Number of vectors that fit in this iteration
     const uint32_t nvec_access = access_size / VEC_SIZE;
 
+    // The address mask works as a modulo because x % 2^n == x & (2^n - 1).
+    // This will help us limit address accessing to a specific set of unique
+    // addresses depending on the access size we want to measure.
+    const uint32_t addr_mask = nvec_access - 1;
+
+    // This is to distribute the accesses to unique addresses across the
+    // workgroups, once the size of the access exceeds the workgroup width.
+    const uint32_t workgroup_width = local_x * NITER * NUNROLL;
+
     StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float));
     StorageBuffer out_buf(
         context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic);
@@ -136,7 +145,11 @@ void _bandwidth(
           pipeline_barrier,
           {global_x, 1, 1},
           {local_x, 1, 1},
-          {SV(NITER), SV(nvec_access), SV(local_x)},
+          {SV(NITER),
+           SV(nvec_access),
+           SV(local_x),
+           SV(addr_mask),
+           SV(workgroup_width)},
           VK_NULL_HANDLE,
           0,
           in_buf.buffer(),
diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h
index bb8a3371a96..7679f11b0ca 100644
--- a/backends/vulkan/tools/gpuinfo/include/textures.h
+++ b/backends/vulkan/tools/gpuinfo/include/textures.h
@@ -164,6 +164,15 @@ void tex_bandwidth(const App& app) {
       // Number of texels that fit in this iteration
       const uint32_t ntexel_access = access_size / VEC_SIZE;
 
+      // The address mask works as a modulo because x % 2^n == x & (2^n - 1).
+      // This will help us limit address accessing to a specific set of unique
+      // addresses depending on the access size we want to measure.
+      const uint32_t addr_mask = ntexel_access - 1;
+
+      // This is to distribute the accesses to unique addresses across the
+      // workgroups, once the size of the access exceeds the workgroup width.
+      const uint32_t workgroup_width = local_x * NITER * NUNROLL;
+
       StorageBuffer out_buf(
           context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic);
       vkapi::PipelineBarrier pipeline_barrier{};
@@ -174,7 +183,11 @@ void tex_bandwidth(const App& app) {
             pipeline_barrier,
             {global_x, 1, 1},
             {local_x, 1, 1},
-            {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)},
+            {SV(NITER),
+             SV(ntexel_access),
+             SV(local_x),
+             SV(addr_mask),
+             SV(workgroup_width)},
             VK_NULL_HANDLE,
             0,
             in_tensor.image(),