Add metric for 3D texture max concurrent cache read (#4421)
Summary:
Pull Request resolved: #4421

This diff introduces a metric that measures the maximum number of concurrent cache line accesses along each dimension of a 3D texture. The experiment assigns each thread a different texel on the texture and gradually increases the number of threads until a single cache line can no longer serve all simultaneous accesses. By detecting the resulting jump in latency, we can determine the maximum size that can be accessed concurrently along each dimension.

NOTE: ArchProbe uses this information to [obtain a supposed cache line size for textures](https://fburl.com/98xiou3g). However, it is unclear why they define the cache line size as the ratio of the larger concurrency value to the smaller one, multiplied by the texel size. It is also unclear how to extend their calculation to three dimensions.

TODO: Understand the relationship between concurrency and cache line size, and modify this metric to output the cache line size.

For a Samsung S22, the latency graph looks like this:

(Figure: latency graph, F1780375117)

Reviewed By: copyrightly

Differential Revision: D60246121

fbshipit-source-id: c2bac010077bf14e95f70bb6038acbb47a534dde
Esteban Padilla Cerdio authored and facebook-github-bot committed Jul 30, 2024
1 parent 298b625 commit 5867129
Showing 4 changed files with 155 additions and 1 deletion.
5 changes: 5 additions & 0 deletions backends/vulkan/tools/gpuinfo/config.json
@@ -39,5 +39,10 @@
"nflush": 4,
"nunroll": 16,
"niter": 10
},
"tex_cacheline_concurr": {
"enabled": true,
"threshold": 3,
"compensate": 0.1
}
}
39 changes: 39 additions & 0 deletions backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl
@@ -0,0 +1,39 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}
#define VEC4_T ${texel_type(DTYPE)}

layout(std430) buffer;

${layout_declare_sampler(0, "r", "in_tex", DTYPE)}
${layout_declare_buffer(1, "w", "out_buf", DTYPE, "PRECISION", False)}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int niter = 1;

void main() {
vec4 sum = vec4(0);
int i = 0;
for (; i < niter; ++i){
$if DIM == 0:
sum += texelFetch(in_tex, ivec3(gl_GlobalInvocationID[0], 0, 0), 0);
$elif DIM == 1:
sum += texelFetch(in_tex, ivec3(0, gl_GlobalInvocationID[0], 0), 0);
$elif DIM == 2:
sum += texelFetch(in_tex, ivec3(0, 0, gl_GlobalInvocationID[0]), 0);
}

// Prevent the compiler from optimizing away the fetch loop: i >> 31 is
// always 0 for the non-negative counter, but depends on a runtime value.
vec4 zero = vec4(i>>31);

out_buf[0] = sum + zero;
}
14 changes: 14 additions & 0 deletions backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml
@@ -0,0 +1,14 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

tex_cacheline_concurr:
parameter_names_with_default_values:
DTYPE: float
generate_variant_forall:
DIM:
- RANGE: [0, 2]
shader_variants:
- NAME: tex_cacheline_concurr
98 changes: 97 additions & 1 deletion backends/vulkan/tools/gpuinfo/src/app.cpp
@@ -291,12 +291,107 @@ class App {
if (stride >= MAX_STRIDE) {
std::cout << "Unable to conclude a top level buffer cacheline size."
<< std::endl;
cacheline_size = MAX_STRIDE;
cacheline_size = MAX_STRIDE * sizeof(float);
}

std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl;
}

// Textures are drastically different from buffers in terms of data layout.
// While buffers are a contiguous range of memory, textures are opaque objects
// defined by the vendor, and it is possible that nearby data points are not
// neighboring in memory. Likewise, data points are accessed in
// multi-dimensional patches instead of simple lines. This makes the stride
// method for deducing the cache line size inapplicable. To work around
// this, this experiment runs an increasing number of threads accessing
// different data points in the texture and measures latency. If the cache line
// is big enough to contain all requested data for that number of threads,
// latency will be low. When there are more threads, and hence more data, than
// a single cache line can handle, a second line must be fetched,
// increasing latency in a measurable way.
void tex_cacheline_concurr() {
if (!_enabled("tex_cacheline_concurr")) {
std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl;
return;
}

const uint32_t TEXEL_WIDTH = 4;
const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH;

const double COMPENSATE =
_get_config("tex_cacheline_concurr", "compensate");
const double THRESHOLD = _get_config("tex_cacheline_concurr", "threshold");

for (int dim = 0; dim < 3; ++dim) {
std::cout << std::endl;
std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim
<< ") ------" << std::endl;

uint32_t NITER;

const uint32_t IMG_OTHER_EDGE = dim == 0 ? max_tex_width_
: dim == 1 ? max_tex_height_
: max_tex_depth_;

const uint32_t MAX_NTHREAD = std::min(nthread_logic_, IMG_OTHER_EDGE);

auto bench = [&](uint32_t nthread) {
std::vector<int64_t> sizes_whd = {
max_tex_width_, max_tex_height_, max_tex_depth_};

auto sizes_nchw = _whd_to_nchw(sizes_whd);

vTensor in_tensor =
api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);

StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH);

vkapi::PipelineBarrier pipeline_barrier{};

auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim);

auto time = benchmark_on_gpu(shader_name, 100, [&]() {
context()->submit_compute_job(
VK_KERNEL_FROM_STR(shader_name),
pipeline_barrier,
{nthread, 1, 1},
{nthread, 1, 1},
{SV(NITER)},
VK_NULL_HANDLE,
0,
in_tensor.image(),
out_buf.buffer());
});
return time;
};

ensure_min_niter(1000, NITER, [&]() { return bench(1); });

DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
uint32_t nthread = 1;
for (; nthread <= MAX_NTHREAD; ++nthread) {
double time = bench(nthread);
std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time
<< std::endl;

if (dj.push(time)) {
auto max_concurrency = nthread - 1;
std::cout << "TextureCachelineConcurrencyDim" << dim << " (B),"
<< max_concurrency * TEXEL_SIZE << std::endl;
break;
}
}
if (nthread >= MAX_NTHREAD) {
std::cout
<< "Unable to conclude an optimal texture cacheline concurrency for dim "
<< dim << std::endl;
}
}

// TODO: Use concurrency information to obtain the cache line size for
// textures as done in https://fburl.com/98xiou3g
}

private:
void _bandwidth(std::string memtype, uint32_t range) {
auto memtype_lower = memtype;
@@ -689,6 +784,7 @@ int main(int argc, const char** argv) {
app.shared_mem_bandwidth();
app.warp_size();
app.tex_bandwidth();
app.tex_cacheline_concurr();

return 0;
}
