From f5683a6df73df84c2223fb7fefa95f0e19dbebe4 Mon Sep 17 00:00:00 2001
From: estebanpadilla <estebanpadilla@meta.com>
Date: Fri, 26 Jul 2024 11:47:43 -0700
Subject: [PATCH] Add metric for 3D texture cache line size (#4421)

Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4421

Differential Revision: https://internalfb.com/D60246121
---
 .../gpuinfo/glsl/tex_cacheline_size.glsl      |  39 ++++++
 .../gpuinfo/glsl/tex_cacheline_size.yaml      |  14 +++
 backends/vulkan/tools/gpuinfo/src/app.cpp     | 113 +++++++++++++++++-
 3 files changed, 165 insertions(+), 1 deletion(-)
 create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.glsl
 create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.yaml

diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.glsl
new file mode 100644
index 00000000000..62659c7bb88
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.glsl
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define VEC4_T ${texel_type(DTYPE)}
+
+layout(std430) buffer;
+
+${layout_declare_sampler(0, "r", "in_tex", DTYPE)}
+${layout_declare_buffer(1, "w", "out_buf", DTYPE, "PRECISION", False)}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int niter = 1;
+
+void main() {
+    vec4 sum = vec4(0);  // accumulates every fetched texel so the reads feed the output
+    int i = 0;  // declared outside the loop so its final value can be used below
+    for (; i < niter; ++i){
+        $if DIM == 0:
+            sum += texelFetch(in_tex, ivec3(gl_GlobalInvocationID[0], 0, 0), 0);
+        $elif DIM == 1:
+            sum +=  texelFetch(in_tex, ivec3(0, gl_GlobalInvocationID[0], 0), 0);
+        $elif DIM == 2:
+            sum +=  texelFetch(in_tex, ivec3(0, 0, gl_GlobalInvocationID[0]), 0);
+    }
+    // `i` equals niter after the loop, so for niter >= 0 `i >> 31` is 0 and the
+    // sum is unchanged. Intended to keep the compiler from optimizing the loop away.
+    vec4 zero = vec4(i>>31);
+    // Every invocation stores to the same element; only a single vec4 is written.
+    out_buf[0] = sum + zero;
+}
diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.yaml b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.yaml
new file mode 100644
index 00000000000..99002aff298
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.yaml
@@ -0,0 +1,14 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+tex_cacheline_size:
+  parameter_names_with_default_values:
+    DTYPE: float  # passed to texel_type(DTYPE) and the sampler/buffer declarations in the .glsl
+  generate_variant_forall:
+    DIM:
+      - RANGE: [0, 2]  # shader branches on DIM == 0/1/2; presumably an inclusive range
+  shader_variants:
+    - NAME: tex_cacheline_size  # app.cpp looks variants up as tex_cacheline_size_<dim>
diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp
index c33e8a011d9..42631702f5e 100644
--- a/backends/vulkan/tools/gpuinfo/src/app.cpp
+++ b/backends/vulkan/tools/gpuinfo/src/app.cpp
@@ -291,12 +291,122 @@ class App {
     if (stride >= MAX_STRIDE) {
       std::cout << "Unable to conclude a top level buffer cacheline size."
                 << std::endl;
-      cacheline_size = MAX_STRIDE;
+      cacheline_size = MAX_STRIDE * sizeof(float);
     }
 
     std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl;
   }
 
+  // Estimates the texture cache line size by probing concurrent texel access.
+  //
+  // Textures are opaque, vendor-defined objects: neighboring texels are not
+  // necessarily neighboring in memory, and data is accessed in
+  // multi-dimensional patches rather than lines, so the stride method used for
+  // buffers does not apply. Instead, for each dimension, an increasing number
+  // of threads each fetch a different texel along that dimension while the
+  // latency is measured. Latency stays low while all accesses fit in one cache
+  // line and jumps measurably once a second line must be fetched.
+  //
+  // Prints "TextureCachelineSize,<bytes>" and "TextureCachelineDim,<X|Y>", or
+  // an "Unable to conclude" message when no latency jump was found on X or Y.
+  void tex_cacheline_size() {
+    if (!_enabled("tex_cacheline_size")) {
+      std::cout << "Skipped Texture Cacheline Size" << std::endl;
+      return;
+    }
+
+    const double COMPENSATE = _get_config("tex_cacheline_size", "compensate");
+    const double THRESHOLD = _get_config("tex_cacheline_size", "threshold");
+
+    // Zero means "no latency jump found"; never left indeterminate.
+    uint32_t concur_nthread_by_dim[3] = {0, 0, 0};
+
+    for (int dim = 0; dim < 3; ++dim) {
+      std::cout << std::endl;
+      std::cout << "------ Texture Cacheline Size (dim = " << dim << ") ------"
+                << std::endl;
+
+      uint32_t NITER;
+      const uint32_t IMG_EDGE = dim == 0 ? max_tex_width_
+          : dim == 1                     ? max_tex_height_
+                                         : max_tex_depth_;
+      const uint32_t MAX_NTHREAD = std::min(nthread_logic_, IMG_EDGE);
+
+      auto bench = [&](uint32_t nthread) {
+        std::vector<int64_t> sizes_whd = {
+            max_tex_width_, max_tex_height_, max_tex_depth_};
+        auto sizes_nchw = _whd_to_nchw(sizes_whd);
+
+        vTensor in_tensor =
+            api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);
+
+        // Single vec4
+        StorageBuffer out_buf(context(), vkapi::kFloat, 4);
+        vkapi::PipelineBarrier pipeline_barrier{};
+        auto shader_name = "tex_cacheline_size_" + std::to_string(dim);
+
+        auto time = benchmark_on_gpu(shader_name, 100, [&]() {
+          context()->submit_compute_job(
+              VK_KERNEL_FROM_STR(shader_name),
+              pipeline_barrier,
+              {nthread, 1, 1},
+              {nthread, 1, 1},
+              {SV(NITER)},
+              VK_NULL_HANDLE,
+              0,
+              in_tensor.image(),
+              out_buf.buffer());
+        });
+        return time;
+      };
+
+      ensure_min_niter(1000, NITER, [&]() { return bench(1); });
+
+      DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+      bool found_jump = false;
+      for (uint32_t nthread = 1; nthread <= MAX_NTHREAD; ++nthread) {
+        double time = bench(nthread);
+        std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time
+                  << std::endl;
+
+        if (dj.push(time)) {
+          // The jump happened at nthread, so nthread - 1 threads still fit.
+          concur_nthread_by_dim[dim] = nthread - 1;
+          found_jump = true;
+          std::cout << "Can concurrently access " << concur_nthread_by_dim[dim]
+                    << "px with " << "minimal cost along dim=" << dim
+                    << std::endl;
+          break;
+        }
+      }
+      if (!found_jump) {
+        std::cout
+            << "Unable to conclude a top level texture cacheline size for dim "
+            << dim << std::endl;
+      }
+    }
+
+    const uint32_t TEXEL_SIZE = 4 * sizeof(float);
+    const uint32_t concur_nthread_x = concur_nthread_by_dim[0];
+    const uint32_t concur_nthread_y = concur_nthread_by_dim[1];
+
+    // Without a usable result on both X and Y the ratio below would divide by
+    // zero, so report the failure instead of printing a bogus metric.
+    if (concur_nthread_x == 0 || concur_nthread_y == 0) {
+      std::cout << "Unable to conclude a texture cacheline size." << std::endl;
+      return;
+    }
+
+    const uint32_t cacheline_size = TEXEL_SIZE *
+        std::max(concur_nthread_x, concur_nthread_y) /
+        std::min(concur_nthread_x, concur_nthread_y);
+    std::cout << "TextureCachelineSize," << cacheline_size << std::endl;
+
+    const std::string cacheline_dim =
+        concur_nthread_x >= concur_nthread_y ? "X" : "Y";
+    std::cout << "TextureCachelineDim," << cacheline_dim << std::endl;
+  }
+
  private:
   void _bandwidth(std::string memtype, uint32_t range) {
     auto memtype_lower = memtype;
@@ -689,6 +799,7 @@ int main(int argc, const char** argv) {
   app.shared_mem_bandwidth();
   app.warp_size();
   app.tex_bandwidth();
+  app.tex_cacheline_size();
 
   return 0;
 }