From 2ddafede2623dec2d555990eee9dc96d63823887 Mon Sep 17 00:00:00 2001 From: Esteban Padilla Cerdio Date: Tue, 30 Jul 2024 11:52:27 -0700 Subject: [PATCH 1/4] Add 3D Texture Bandwidth metric (#4336) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4336 This diff introduces a profiler that obtains the maximum and minimum bandwidth for reading unique addresses from 3D textures in each of its dimensions, using the following shader, where A is a 3D texture and B is a writeonly buffer. The calculation of the texel position will depend on the dimension that is being benchmarked x : pos = ivec3(offset, 0, 0) y : pos = ivec3(0, offset, 0) z : pos = ivec3(0, 0, offset) void main() { vec4 sum = vec4(0); const uint workgroup_width = local_group_size * niter * ${NUNROLL}; uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask; int i = 0; for (; i < niter; ++i) { sum *= texelFetch(A, pos, 0); offset = (offset + local_group_size) & addr_mask; ... ... sum *= texelFetch(A, pos, 0); offset = (offset + local_group_size) & addr_mask; } vec4 zero = vec4(i>>31); B[gl_LocalInvocationID[0]] = sum + zero; } The address mask allows us to control how many unique addresses we are accessing. If the number of unique vectors we want to read is 3, the offset will jump between three unique addresses throughout the iterations, giving us the bandwidth for that specific size of data. If the size of the unique data read is larger than the work group size, then each run will have its own block of data to read, defined by the initial offset calculation, where the offset is obtained through the workgroup ID and the local invocation ID. Finally, we make sure to use the `sum` and `i ` variables so that the compiler's optimizer does not flatten the loops. For a Samsung S22, the bandwidth behaves like this for each of the dimensions. {F1767497386} Comparing the bandwidth for the X dimension to OpenCL, which was obtained through [ArchProbe](https://github.com/microsoft/ArchProbe), we can observe that, although the behavior is the same, Vulkan has an increased bandwidth for most access sizes. {F1767497972} Comparing to the bandwidth for buffers, we can observe that the bandwidth is similar to regular buffers, but still much smaller than UBOs at small access sizes. {F1767497707} Reviewed By: jorgep31415 Differential Revision: D59980139 --- .../tools/gpuinfo/glsl/tex_bandwidth.glsl | 59 +++++++++ .../tools/gpuinfo/glsl/tex_bandwidth.yaml | 15 +++ backends/vulkan/tools/gpuinfo/src/app.cpp | 112 ++++++++++++++++++ 3 files changed, 186 insertions(+) create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl new file mode 100644 index 0000000000..d848fc0475 --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl @@ -0,0 +1,59 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +${layout_declare_sampler(0, "r", "A", DTYPE)} +${layout_declare_buffer(1, "w", "B", DTYPE, "PRECISION", False)} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int niter = 1; +layout(constant_id = 4) const int nvec = 1; +layout(constant_id = 5) const int local_group_size = 1; + +void main() { + // The address mask works as a modulo because x % 2^n == x & (2^n - 1). + // This will help us limit address accessing to a specific set of unique + // addresses depending on the access size we want to measure. + const int addr_mask = nvec - 1; + vec4 sum = vec4(0); + + // This is to distribute the accesses to unique addresses across the workgroups, once the + // size of the access excedes the workgroup width. + const uint workgroup_width = local_group_size * niter * ${NUNROLL}; + uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask; + + int i = 0; + for (; i < niter; ++i){ + VEC4_T in_texel; + $for j in range(int(NUNROLL)): + $if DIM == 0: + in_texel = texelFetch(A, ivec3(offset, 0, 0), 0); + $elif DIM == 1: + in_texel = texelFetch(A, ivec3(0, offset, 0), 0); + $elif DIM == 2: + in_texel = texelFetch(A, ivec3(0, 0, offset), 0); + + sum *= in_texel; + + // On each unroll, a new unique address will be accessed through the offset, + // limited by the address mask to a specific set of unique addresses + offset = (offset + local_group_size) & addr_mask; + } + + // This is to ensure no compiler optimizations occur + vec4 zero = vec4(i>>31); + + B[gl_LocalInvocationID[0]] = sum + zero; +} diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml new file mode 100644 index 0000000000..84da6938fd --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
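+
+# Note: NUNROLL sets how many texelFetch reads are unrolled per loop
+# iteration in tex_bandwidth.glsl and must match the NUNROLL constant used by
+# tex_bandwidth() in app.cpp. One shader variant is generated per DIM in
+# [0, 2], one for each texture axis (x, y, z).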
+ +tex_bandwidth: + parameter_names_with_default_values: + DTYPE: float + NUNROLL: "16" + generate_variant_forall: + DIM: + - RANGE: [0, 2] + shader_variants: + - NAME: tex_bandwidth diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp index 8facdb5160..92eef84068 100644 --- a/backends/vulkan/tools/gpuinfo/src/app.cpp +++ b/backends/vulkan/tools/gpuinfo/src/app.cpp @@ -22,6 +22,9 @@ class App { uint32_t sm_count_; uint32_t nthread_logic_; uint32_t subgroup_size_; + uint32_t max_tex_width_; + uint32_t max_tex_height_; + uint32_t max_tex_depth_; public: App() { @@ -36,6 +39,9 @@ class App { nthread_logic_ = cl_device.getInfo(); buf_cache_size_ = cl_device.getInfo(); max_shared_mem_size_ = cl_device.getInfo(); + max_tex_width_ = cl_device.getInfo(); + max_tex_height_ = cl_device.getInfo(); + max_tex_depth_ = cl_device.getInfo(); VkPhysicalDeviceSubgroupProperties subgroup_props{}; VkPhysicalDeviceProperties2 props2{}; @@ -54,6 +60,9 @@ class App { std::cout << "Cache Size," << buf_cache_size_ << std::endl; std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl; std::cout << "SubGroup Size," << subgroup_size_ << std::endl; + std::cout << "MaxTexWidth," << max_tex_width_ << std::endl; + std::cout << "MaxTexHeight," << max_tex_height_ << std::endl; + std::cout << "MaxTexDepth," << max_tex_depth_ << std::endl; } void reg_count() { @@ -308,6 +317,15 @@ class App { << std::endl; } + std::vector _whd_to_nchw(std::vector sizes) { + const int64_t W = sizes[0]; + const int64_t H = sizes[1]; + const int64_t D = sizes[2]; + + // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} + return {1, D * 4, H, W}; + } + public: void buf_bandwidth() { std::cout << "\n------ Memory Bandwidth ------" << std::endl; @@ -323,12 +341,105 @@ class App { const uint32_t RANGE = 128 * 1024 * 1024; _bandwidth("UBO", RANGE); } + void shared_mem_bandwidth() { std::cout << "\n------ Shared Bandwidth ------" << std::endl; const uint32_t RANGE = max_shared_mem_size_; _bandwidth("Shared", RANGE); } + void tex_bandwidth() { + for (int dim = 0; dim < 3; dim++) { + std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------" + << std::endl; + const uint32_t MAX_SIZE = dim == 0 ? max_tex_width_ + : dim == 1 ? max_tex_height_ + : max_tex_depth_; + + // rgba, float + const uint32_t VEC_WIDTH = 4; + const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); + const uint32_t NVEC = MAX_SIZE; + + const uint32_t RANGE = NVEC * VEC_SIZE; + + // Cache lines flushed + const uint32_t NFLUSH = 4; + // Number of loop unrolls. Changing this value requires an equal change in + // tex_bandwidth.yaml + const uint32_t NUNROLL = 16; + // Number of iterations. Increasing this value reduces noise in exchange + // for higher latency. 
+ const uint32_t NITER = 10; + // Number of memory reads per thread + const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; + // Number of threads needed to read all texells + const uint32_t NTHREAD = NVEC; + // Occupy all threads + const uint32_t local_x = nthread_logic_; + // Ensure that global is a multiple of local, and distribute across all + // SMs + const uint32_t global_x = + (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH; + + auto shader_name = "tex_bandwidth_" + std::to_string(dim); + + std::vector sizes_whd = {MAX_SIZE, 1, 1}; + if (dim == 1) { + sizes_whd = {1, MAX_SIZE, 1}; + } else if (dim == 2) { + sizes_whd = {1, 1, MAX_SIZE}; + } + auto sizes_nchw = _whd_to_nchw(sizes_whd); + + vTensor in_tensor = + api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); + + auto bench = [&](uint32_t access_size, uint32_t dim) { + // Number of texels that fit in this iteration + const uint32_t ntexel_access = access_size / VEC_SIZE; + + StorageBuffer out_buf( + context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto time = benchmark_on_gpu(shader_name, 10, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {global_x, 1, 1}, + {local_x, 1, 1}, + {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)}, + VK_NULL_HANDLE, + 0, + in_tensor.image(), + out_buf.buffer()); + }); + + const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; + double gbps = SIZE_TRANS * 1e-3 / time; + std::cout << "Texture bandwidth accessing \t" << access_size + << "\tB unique data is \t" << gbps << " \tgbps (\t" << time + << "\tus)" << std::endl; + return gbps; + }; + + double max_bandwidth = 0; + double min_bandwidth = DBL_MAX; + for (uint32_t access_size = VEC_SIZE; access_size < RANGE; + access_size *= 2) { + double gbps = bench(access_size, dim); + max_bandwidth = std::max(gbps, max_bandwidth); + min_bandwidth = std::min(gbps, min_bandwidth); + } + + std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth + << std::endl; + std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth + << std::endl; + } + } + // Warp size is a difficult metric to obtain because the hardware limitations // do not always coincide with the way the SM divides the workload. For // instance, the hardware can have a warp size of 64 threads, but an SM might @@ -492,6 +603,7 @@ int main(int argc, const char** argv) { app.ubo_bandwidth(); app.shared_mem_bandwidth(); app.warp_size(); + app.tex_bandwidth(); return 0; } From 197ebc8390101b177ee66a829aa36e3abdee5aac Mon Sep 17 00:00:00 2001 From: Esteban Padilla Cerdio Date: Tue, 30 Jul 2024 11:52:27 -0700 Subject: [PATCH 2/4] Add config file support for constants and test control (#4337) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4337 Now that the tool is getting larger, a configuration file for defining which tests to run and which to skip, as well as specifying some values like thresholds and ranges, comes in handy. This diff adds support for a JSON config file with specifications for each test. 
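Each test now has a matching entry in config.json (for example, reg_count carries enabled, threshold and compensate keys) and gates itself on that entry instead of using hard-coded constants. The pattern introduced in this diff is roughly:

  void reg_count() {
    if (!_enabled("reg_count")) {
      std::cout << "Skipped Register Count" << std::endl;
      return;
    }
    // ...
    const double COMPENSATE = _get_config("reg_count", "compensate");
    const double THRESHOLD = _get_config("reg_count", "threshold");
    // ... run the experiment with these values ...
  }

The config path defaults to config.json and can be overridden by passing a different path as the first command-line argument.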
Reviewed By: jorgep31415 Differential Revision: D60060188 --- backends/vulkan/tools/gpuinfo/config.json | 43 ++++++ backends/vulkan/tools/gpuinfo/src/app.cpp | 151 +++++++++++++++++----- 2 files changed, 161 insertions(+), 33 deletions(-) create mode 100644 backends/vulkan/tools/gpuinfo/config.json diff --git a/backends/vulkan/tools/gpuinfo/config.json b/backends/vulkan/tools/gpuinfo/config.json new file mode 100644 index 0000000000..1efb9690fe --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/config.json @@ -0,0 +1,43 @@ +{ + "reg_count": { + "enabled": true, + "threshold": 3, + "compensate": 0.1 + }, + "buf_cacheline_size": { + "enabled": true, + "threshold": 10, + "compensate": 0.1 + }, + "buffer_bandwidth": { + "enabled": true, + "range": 134217728, + "nflush": 4, + "nunroll": 16, + "niter": 10 + }, + "ubo_bandwidth": { + "enabled": true, + "range": 134217728, + "nflush": 4, + "nunroll": 16, + "niter": 10 + }, + "shared_mem_bandwidth": { + "enabled": true, + "nflush": 4, + "nunroll": 16, + "niter": 10 + }, + "warp_size": { + "enabled": true, + "threshold": 3, + "compensate": 0.1 + }, + "tex_bandwidth": { + "enabled": true, + "nflush": 4, + "nunroll": 16, + "niter": 10 + } +} diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp index 92eef84068..c33e8a011d 100644 --- a/backends/vulkan/tools/gpuinfo/src/app.cpp +++ b/backends/vulkan/tools/gpuinfo/src/app.cpp @@ -8,6 +8,8 @@ #include #include +#include +#include #include #include "stats.h" @@ -25,6 +27,46 @@ class App { uint32_t max_tex_width_; uint32_t max_tex_height_; uint32_t max_tex_depth_; + folly::dynamic config_; + + std::vector _whd_to_nchw(std::vector sizes) { + const int64_t W = sizes[0]; + const int64_t H = sizes[1]; + const int64_t D = sizes[2]; + + // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} + return {1, D * 4, H, W}; + } + + float _get_config(const std::string& test, const std::string& key) { + if (config_[test].empty()) { + throw std::runtime_error("Missing config for " + test); + } + + if (!config_[test][key].isNumber()) { + throw std::runtime_error( + "Config for " + test + "." + key + " is not a number"); + } + + float value; + if (config_[test][key].isDouble()) { + value = config_[test][key].getDouble(); + } else { + value = config_[test][key].getInt(); + } + + std::cout << "Read value for " << test << "." 
<< key << " = " << value + << std::endl; + return value; + } + + bool _enabled(const std::string& test) { + if (config_.empty() || config_[test].empty() || + !config_[test]["enabled"].isBool()) { + return true; + } + return config_[test]["enabled"].getBool(); + } public: App() { @@ -65,16 +107,32 @@ class App { std::cout << "MaxTexDepth," << max_tex_depth_ << std::endl; } + void load_config(std::string file_path) { + std::ifstream file(file_path); + std::stringstream buffer; + buffer << file.rdbuf(); + const std::string json_str = buffer.str(); + if (json_str.empty()) { + throw std::runtime_error( + "Failed to read config file from " + file_path + "."); + } + config_ = folly::parseJson(json_str); + } + void reg_count() { + if (!_enabled("reg_count")) { + std::cout << "Skipped Register Count" << std::endl; + return; + } + std::cout << std::endl; std::cout << "------ Register Count ------" << std::endl; const uint32_t NREG_MIN = 1; const uint32_t NREG_MAX = 512; const uint32_t NREG_STEP = 1; - // TODO: Make these values configurable - const double COMPENSATE = 0.01; - const double THRESHOLD = 3; + const double COMPENSATE = _get_config("reg_count", "compensate"); + const double THRESHOLD = _get_config("reg_count", "threshold"); const uint32_t NGRP_MIN = 1; const uint32_t NGRP_MAX = 64; @@ -175,12 +233,16 @@ class App { } void buf_cacheline_size() { + if (!_enabled("buf_cacheline_size")) { + std::cout << "Skipped Buffer Cacheline Size" << std::endl; + return; + } + std::cout << std::endl; std::cout << "------ Buffer Cacheline Size ------" << std::endl; - // TODO: Make these values configurable - const double COMPENSATE = 0.01; - const double THRESHOLD = 10; + const double COMPENSATE = _get_config("buf_cacheline_size", "compensate"); + const double THRESHOLD = _get_config("buf_cacheline_size", "threshold"); const uint32_t PITCH = buf_cache_size_ / nthread_logic_; const uint32_t BUF_SIZE = buf_cache_size_; @@ -237,15 +299,23 @@ class App { private: void _bandwidth(std::string memtype, uint32_t range) { - // TODO: Make these values configurable + auto memtype_lower = memtype; + std::transform( + memtype_lower.begin(), + memtype_lower.end(), + memtype_lower.begin(), + [](unsigned char c) { return std::tolower(c); }); + + auto test_name = memtype_lower + "_bandwidth"; + // Cache lines flushed - const uint32_t NFLUSH = 4; + const uint32_t NFLUSH = _get_config(test_name, "nflush"); // Number of loop unrolls. Changing this value requires an equal change in // buf_bandwidth.yaml - const uint32_t NUNROLL = 16; + const uint32_t NUNROLL = _get_config(test_name, "nunroll"); // Number of iterations. Increasing this value reduces noise in exchange for // higher latency. 
- const uint32_t NITER = 10; + const uint32_t NITER = _get_config(test_name, "niter"); // Vector dimensions (vec4) const uint32_t VEC_WIDTH = 4; const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); @@ -273,12 +343,6 @@ class App { context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_); vkapi::PipelineBarrier pipeline_barrier{}; - auto memtype_lower = memtype; - std::transform( - memtype_lower.begin(), - memtype_lower.end(), - memtype_lower.begin(), - [](unsigned char c) { return std::tolower(c); }); auto shader_name = "buf_bandwidth_" + memtype_lower; auto time = benchmark_on_gpu(shader_name, 10, [&]() { @@ -317,38 +381,49 @@ class App { << std::endl; } - std::vector _whd_to_nchw(std::vector sizes) { - const int64_t W = sizes[0]; - const int64_t H = sizes[1]; - const int64_t D = sizes[2]; - - // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} - return {1, D * 4, H, W}; - } - public: void buf_bandwidth() { + if (!_enabled("buffer_bandwidth")) { + std::cout << "Skipped Memory Bandwidth" << std::endl; + return; + } + std::cout << "\n------ Memory Bandwidth ------" << std::endl; // Maximum memory space read - 128MB // For regular devices, bandwidth plateaus at less memory than this, so more // is not needed. - const uint32_t RANGE = 128 * 1024 * 1024; + const uint32_t RANGE = _get_config("buffer_bandwidth", "range"); _bandwidth("Buffer", RANGE); } void ubo_bandwidth() { + if (!_enabled("ubo_bandwidth")) { + std::cout << "Skipped UBO Bandwidth" << std::endl; + return; + } + std::cout << "\n------ UBO Bandwidth ------" << std::endl; - const uint32_t RANGE = 128 * 1024 * 1024; + const uint32_t RANGE = _get_config("ubo_bandwidth", "range"); _bandwidth("UBO", RANGE); } void shared_mem_bandwidth() { + if (!_enabled("shared_mem_bandwidth")) { + std::cout << "Skipped Shared Memory Bandwidth" << std::endl; + return; + } + std::cout << "\n------ Shared Bandwidth ------" << std::endl; const uint32_t RANGE = max_shared_mem_size_; _bandwidth("Shared", RANGE); } void tex_bandwidth() { + if (!_enabled("tex_bandwidth")) { + std::cout << "Skipped Texture Bandwidth" << std::endl; + return; + } + for (int dim = 0; dim < 3; dim++) { std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------" << std::endl; @@ -364,13 +439,13 @@ class App { const uint32_t RANGE = NVEC * VEC_SIZE; // Cache lines flushed - const uint32_t NFLUSH = 4; + const uint32_t NFLUSH = _get_config("tex_bandwidth", "nflush"); // Number of loop unrolls. Changing this value requires an equal change in // tex_bandwidth.yaml - const uint32_t NUNROLL = 16; + const uint32_t NUNROLL = _get_config("tex_bandwidth", "nunroll"); // Number of iterations. Increasing this value reduces noise in exchange // for higher latency. - const uint32_t NITER = 10; + const uint32_t NITER = _get_config("tex_bandwidth", "niter"); // Number of memory reads per thread const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; // Number of threads needed to read all texells @@ -458,6 +533,11 @@ class App { // In Case 2, like in Adreno, the driver might decide to pack multiple works // together and dispatch them at once. void warp_size(bool verbose = false) { + if (!_enabled("warp_size")) { + std::cout << "Skipped Warp Size" << std::endl; + return; + } + std::cout << "\n------ Warp Size ------" << std::endl; // Method A: Stress test with a kernel that uses complex ALU operations like @@ -467,8 +547,8 @@ class App { // This timing-based method helps us identify physical warp sizes. 
It also // helps with Case 2, when threads of multiple warps are managed by the same // scheduler at the same time. - const double COMPENSATE = 0.01; - const double THRESHOLD = 3; + const double COMPENSATE = _get_config("warp_size", "compensate"); + const double THRESHOLD = _get_config("warp_size", "threshold"); uint32_t NITER; @@ -596,7 +676,12 @@ class App { int main(int argc, const char** argv) { App app; - // TODO: Allow user to skip tests + std::string file_path = "config.json"; + if (argc > 1) { + file_path = argv[1]; + }; + app.load_config(file_path); + app.reg_count(); app.buf_cacheline_size(); app.buf_bandwidth(); From ff282c5688ea8dbf9152d409eeb1151728a6162f Mon Sep 17 00:00:00 2001 From: Esteban Padilla Cerdio Date: Tue, 30 Jul 2024 11:52:27 -0700 Subject: [PATCH 3/4] Add metric for 3D texture max concurrent cache read (#4421) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4421 This diff introduces a metric to calculate the maximum concurrent cache line accesses for each dimension of a 3D texture. The experiment works by allowing each thread to access a different texel on the texture and slowly increasing the number of threads, until the cache line is no longer able to handle all simultaneous accesses. By detecting a jump in latency, we can define the optimal maximum size that can be accessed concurrently on each dimension. NOTE: ArchProbe uses this information to[ obtain a supposed cache line size for textures](https://fburl.com/98xiou3g). However, it is unclear why they define the cache line size as being the ratio between the larger concurrency value over the lower, times the texel size. It is also unclear how to extend their calculations to three dimensions. TODO: Understand the relationship between concurrency and cache line size, and modify this metric to output the cache line size. For a Samsung S22, the latency graph looks like this: {F1780375117} Reviewed By: copyrightly Differential Revision: D60246121 --- backends/vulkan/tools/gpuinfo/config.json | 5 + .../gpuinfo/glsl/tex_cacheline_concurr.glsl | 39 ++++++++ .../gpuinfo/glsl/tex_cacheline_concurr.yaml | 14 +++ backends/vulkan/tools/gpuinfo/src/app.cpp | 98 ++++++++++++++++++- 4 files changed, 155 insertions(+), 1 deletion(-) create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml diff --git a/backends/vulkan/tools/gpuinfo/config.json b/backends/vulkan/tools/gpuinfo/config.json index 1efb9690fe..7307f29503 100644 --- a/backends/vulkan/tools/gpuinfo/config.json +++ b/backends/vulkan/tools/gpuinfo/config.json @@ -39,5 +39,10 @@ "nflush": 4, "nunroll": 16, "niter": 10 + }, + "tex_cacheline_concurr": { + "enabled": true, + "threshold": 3, + "compensate": 0.1 } } diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl new file mode 100644 index 0000000000..62659c7bb8 --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl @@ -0,0 +1,39 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +${layout_declare_sampler(0, "r", "in_tex", DTYPE)} +${layout_declare_buffer(1, "w", "out_buf", DTYPE, "PRECISION", False)} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int niter = 1; + +void main() { + vec4 sum = vec4(0); + int i = 0; + for (; i < niter; ++i){ + $if DIM == 0: + sum += texelFetch(in_tex, ivec3(gl_GlobalInvocationID[0], 0, 0), 0); + $elif DIM == 1: + sum += texelFetch(in_tex, ivec3(0, gl_GlobalInvocationID[0], 0), 0); + $elif DIM == 2: + sum += texelFetch(in_tex, ivec3(0, 0, gl_GlobalInvocationID[0]), 0); + } + + // This is to ensure no compiler optimizations occur + vec4 zero = vec4(i>>31); + + out_buf[0] = sum + zero; +} diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml new file mode 100644 index 0000000000..6b557c9f66 --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +tex_cacheline_concurr: + parameter_names_with_default_values: + DTYPE: float + generate_variant_forall: + DIM: + - RANGE: [0, 2] + shader_variants: + - NAME: tex_cacheline_concurr diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp index c33e8a011d..2b1621db62 100644 --- a/backends/vulkan/tools/gpuinfo/src/app.cpp +++ b/backends/vulkan/tools/gpuinfo/src/app.cpp @@ -291,12 +291,107 @@ class App { if (stride >= MAX_STRIDE) { std::cout << "Unable to conclude a top level buffer cacheline size." << std::endl; - cacheline_size = MAX_STRIDE; + cacheline_size = MAX_STRIDE * sizeof(float); } std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl; } + // Textures are drastically different from buffers in terms of data layout. + // While buffers are a contiguous range of memory, textures are opaque objects + // defined by the vendor and it is possible that nearby points of data are not + // neighboring in memory. Likewise, data points are accessed in + // multi-dimensional patches instead of simple lines. This makes the stride + // method for figuring out the cache line size not applicable. To go around + // this, this experiment runs an increasing amount of threads accessing + // different datapoints in the texture and measures latency. If the cache line + // is big enough to contain all requested data for the amount of threads, + // latency will be low. When there are more threads and hence more data than + // what a single cache line can handle, a second line must be fetched, + // increasing latency in a measurable way. 
+ void tex_cacheline_concurr() { + if (!_enabled("tex_cacheline_concurr")) { + std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl; + return; + } + + const uint32_t TEXEL_WIDTH = 4; + const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH; + + const double COMPENSATE = + _get_config("tex_cacheline_concurr", "compensate"); + const double THRESHOLD = _get_config("tex_cacheline_concurr", "threshold"); + + for (int dim = 0; dim < 3; ++dim) { + std::cout << std::endl; + std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim + << ") ------" << std::endl; + + uint32_t NITER; + + const uint32_t IMG_OTHER_EDGE = dim == 0 ? max_tex_width_ + : dim == 1 ? max_tex_height_ + : max_tex_depth_; + + const uint32_t MAX_NTHREAD = std::min(nthread_logic_, IMG_OTHER_EDGE); + + auto bench = [&](uint32_t nthread) { + std::vector sizes_whd = { + max_tex_width_, max_tex_height_, max_tex_depth_}; + + auto sizes_nchw = _whd_to_nchw(sizes_whd); + + vTensor in_tensor = + api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); + + StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); + + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim); + + auto time = benchmark_on_gpu(shader_name, 100, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {nthread, 1, 1}, + {nthread, 1, 1}, + {SV(NITER)}, + VK_NULL_HANDLE, + 0, + in_tensor.image(), + out_buf.buffer()); + }); + return time; + }; + + ensure_min_niter(1000, NITER, [&]() { return bench(1); }); + + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + uint32_t nthread = 1; + for (; nthread <= MAX_NTHREAD; ++nthread) { + double time = bench(nthread); + std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time + << std::endl; + + if (dj.push(time)) { + auto max_concurrency = nthread - 1; + std::cout << "TextureCachelineConcurrencyDim" << dim << " (B)," + << max_concurrency * TEXEL_SIZE << std::endl; + break; + } + } + if (nthread >= MAX_NTHREAD) { + std::cout + << "Unable to conclude an optimal texture cacheline concurrency for dim " + << dim << std::endl; + }; + } + + // TODO: Use concurrency information to obtain the cache line size for + // textures as done in https://fburl.com/98xiou3g + } + private: void _bandwidth(std::string memtype, uint32_t range) { auto memtype_lower = memtype; @@ -689,6 +784,7 @@ int main(int argc, const char** argv) { app.shared_mem_bandwidth(); app.warp_size(); app.tex_bandwidth(); + app.tex_cacheline_concurr(); return 0; } From ba7f3ab6b7c695c08c933cc9b3d3d689a8ef0c2a Mon Sep 17 00:00:00 2001 From: Esteban Padilla Cerdio Date: Tue, 30 Jul 2024 11:56:23 -0700 Subject: [PATCH 4/4] Refactor and class split (#4432) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4432 Big classes are scary ☹️ This diff subdivides the tests into categories, places them as functions inside the gpuinfo namespace, instead of as part of the App class, and the App class is now only for persisting device information and configuration. 
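Each probe is now a free function in the gpuinfo namespace that takes the App by const reference, so the entry point only has to construct the App, load the config, and call the probes. The new src/main.cpp is not included in this excerpt; based on the headers below and the previous main(), it presumably reduces to something close to:

  #include <string>

  #include "app.h"
  #include "architecture.h"
  #include "buffers.h"
  #include "textures.h"

  int main(int argc, const char** argv) {
    // Approximate sketch; the actual main.cpp in this diff may differ.
    gpuinfo::App app;

    std::string file_path = "config.json";
    if (argc > 1) {
      file_path = argv[1];
    }
    app.load_config(file_path);

    gpuinfo::reg_count(app);
    gpuinfo::buf_cacheline_size(app);
    gpuinfo::buf_bandwidth(app);
    gpuinfo::ubo_bandwidth(app);
    gpuinfo::shared_mem_bandwidth(app);
    gpuinfo::warp_size(app);
    gpuinfo::tex_bandwidth(app);
    gpuinfo::tex_cacheline_concurr(app);
    return 0;
  }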
Reviewed By: jorgep31415 Differential Revision: D60290882 --- backends/vulkan/tools/gpuinfo/config.json | 2 +- backends/vulkan/tools/gpuinfo/include/app.h | 114 +++ .../tools/gpuinfo/include/architecture.h | 285 +++++++ .../vulkan/tools/gpuinfo/include/buffers.h | 203 +++++ .../vulkan/tools/gpuinfo/include/textures.h | 207 +++++ backends/vulkan/tools/gpuinfo/include/utils.h | 9 + backends/vulkan/tools/gpuinfo/src/app.cpp | 790 ------------------ backends/vulkan/tools/gpuinfo/src/main.cpp | 40 + 8 files changed, 859 insertions(+), 791 deletions(-) create mode 100644 backends/vulkan/tools/gpuinfo/include/app.h create mode 100644 backends/vulkan/tools/gpuinfo/include/architecture.h create mode 100644 backends/vulkan/tools/gpuinfo/include/buffers.h create mode 100644 backends/vulkan/tools/gpuinfo/include/textures.h delete mode 100644 backends/vulkan/tools/gpuinfo/src/app.cpp create mode 100644 backends/vulkan/tools/gpuinfo/src/main.cpp diff --git a/backends/vulkan/tools/gpuinfo/config.json b/backends/vulkan/tools/gpuinfo/config.json index 7307f29503..afb5cbc6c5 100644 --- a/backends/vulkan/tools/gpuinfo/config.json +++ b/backends/vulkan/tools/gpuinfo/config.json @@ -23,7 +23,7 @@ "nunroll": 16, "niter": 10 }, - "shared_mem_bandwidth": { + "shared_bandwidth": { "enabled": true, "nflush": 4, "nunroll": 16, diff --git a/backends/vulkan/tools/gpuinfo/include/app.h b/backends/vulkan/tools/gpuinfo/include/app.h new file mode 100644 index 0000000000..a46e9e6b9a --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/include/app.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include "utils.h" + +namespace gpuinfo { + +class App { + private: + folly::dynamic config_; + + public: + size_t buf_cache_size; + uint32_t max_shared_mem_size; + uint32_t sm_count; + uint32_t nthread_logic; + uint32_t subgroup_size; + uint32_t max_tex_width; + uint32_t max_tex_height; + uint32_t max_tex_depth; + + App() { + context()->initialize_querypool(); + + std::cout << context()->adapter_ptr()->stringize() << std::endl + << std::endl; + + auto cl_device = get_cl_device(); + + sm_count = cl_device.getInfo(); + nthread_logic = cl_device.getInfo(); + buf_cache_size = cl_device.getInfo(); + max_shared_mem_size = cl_device.getInfo(); + max_tex_width = cl_device.getInfo(); + max_tex_height = cl_device.getInfo(); + max_tex_depth = cl_device.getInfo(); + + VkPhysicalDeviceSubgroupProperties subgroup_props{}; + VkPhysicalDeviceProperties2 props2{}; + + props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + props2.pNext = &subgroup_props; + subgroup_props.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; + vkGetPhysicalDeviceProperties2( + context()->adapter_ptr()->physical_handle(), &props2); + subgroup_size = subgroup_props.subgroupSize; + + std::cout << std::endl; + std::cout << "SM count," << sm_count << std::endl; + std::cout << "Logic Thread Count," << nthread_logic << std::endl; + std::cout << "Cache Size," << buf_cache_size << std::endl; + std::cout << "Shared Memory Size," << max_shared_mem_size << std::endl; + std::cout << "SubGroup Size," << subgroup_size << std::endl; + std::cout << "MaxTexWidth," << max_tex_width << std::endl; + std::cout << "MaxTexHeight," << max_tex_height << std::endl; + std::cout << "MaxTexDepth," << max_tex_depth << std::endl; + } + + float get_config(const std::string& test, const std::string& key) const { + if (config_[test].empty()) { + throw std::runtime_error("Missing config for " + test); + } + + if (!config_[test][key].isNumber()) { + throw std::runtime_error( + "Config for " + test + "." + key + " is not a number"); + } + + float value; + if (config_[test][key].isDouble()) { + value = config_[test][key].getDouble(); + } else { + value = config_[test][key].getInt(); + } + + std::cout << "Read value for " << test << "." << key << " = " << value + << std::endl; + return value; + } + + bool enabled(const std::string& test) const { + if (config_.empty() || config_[test].empty() || + !config_[test]["enabled"].isBool()) { + return true; + } + return config_[test]["enabled"].getBool(); + } + + void load_config(std::string file_path) { + std::ifstream file(file_path); + std::stringstream buffer; + buffer << file.rdbuf(); + const std::string json_str = buffer.str(); + if (json_str.empty()) { + throw std::runtime_error( + "Failed to read config file from " + file_path + "."); + } + config_ = folly::parseJson(json_str); + } +}; +} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h new file mode 100644 index 0000000000..0d312ee87c --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/include/architecture.h @@ -0,0 +1,285 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +#include "app.h" +#include "stats.h" +#include "utils.h" + +using namespace vkapi; + +namespace gpuinfo { + +void reg_count(const App& app) { + if (!app.enabled("reg_count")) { + std::cout << "Skipped Register Count" << std::endl; + return; + } + + std::cout << std::endl; + std::cout << "------ Register Count ------" << std::endl; + const uint32_t NREG_MIN = 1; + const uint32_t NREG_MAX = 512; + const uint32_t NREG_STEP = 1; + + const double COMPENSATE = app.get_config("reg_count", "compensate"); + const double THRESHOLD = app.get_config("reg_count", "threshold"); + + const uint32_t NGRP_MIN = 1; + const uint32_t NGRP_MAX = 64; + const uint32_t NGRP_STEP = 1; + + uint32_t NITER; + + auto bench = [&](uint32_t ngrp, uint32_t nreg) { + StorageBuffer buffer(context(), vkapi::kFloat, 1); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "reg_count_" + std::to_string(nreg); + + auto time = benchmark_on_gpu(shader_name, 30, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {1, ngrp, 1}, + {1, 1, 1}, + {SV(NITER)}, + VK_NULL_HANDLE, + 0, + buffer.buffer()); + }); + return time; + }; + + ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); }); + + uint32_t nreg_max; + + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + uint32_t nreg = NREG_MIN; + for (; nreg <= NREG_MAX; nreg += NREG_STEP) { + double time = bench(1, nreg); + std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << "\tus" + << std::endl; + if (dj.push(time)) { + nreg -= NREG_STEP; + nreg_max = nreg; + break; + } + } + if (nreg >= NREG_MAX) { + std::cout << "Unable to conclude a maximal register count" << std::endl; + nreg_max = NREG_STEP; + } else { + std::cout << nreg_max << " registers are available at most" << std::endl; + } + + auto find_ngrp_by_nreg = [&](const uint32_t nreg) { + DtJumpFinder<3> dj(COMPENSATE, THRESHOLD); + for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) { + auto time = bench(ngrp, nreg); + std::cout << "Testing occupation (nreg=\t" << nreg << "\t); ngrp=\t" + << ngrp << "\t, time=\t" << time << "\tus" << std::endl; + + if (dj.push(time)) { + ngrp -= NGRP_STEP; + std::cout << "Using " << nreg << " registers can have " << ngrp + << " concurrent single-thread workgroups" << std::endl; + return ngrp; + } + } + std::cout + << "Unable to conclude a maximum number of concurrent single-thread workgroups when " + << nreg << " registers are occupied" << std::endl; + return (uint32_t)1; + }; + + uint32_t ngrp_full, ngrp_half; + ngrp_full = find_ngrp_by_nreg(nreg_max); + ngrp_half = find_ngrp_by_nreg(nreg_max / 2); + + std::string reg_ty; + + if (ngrp_full * 1.5 < ngrp_half) { + std::cout << "All physical threads in an sm share " << nreg_max + << " registers" << std::endl; + reg_ty = "Pooled"; + + } else { + std::cout << "Each physical thread has " << nreg_max << " registers" + << std::endl; + reg_ty = "Dedicated"; + } + + std::cout << std::endl << std::endl; + std::cout << "MaxRegisters," << nreg_max << std::endl; + std::cout << "ConcurrentWorkgroupsFullReg," << ngrp_full << std::endl; + std::cout << "ConcurrentWorkgroupsHalfReg," << ngrp_half << std::endl; + std::cout << "RegisterType," << reg_ty << std::endl; +} + +// Warp size is a difficult metric to obtain because the hardware limitations +// do not always coincide with the way the SM divides the workload. 
For +// instance, the hardware can have a warp size of 64 threads, but an SM might +// be able to simulate concurrency of 128 threads with a single scheduler. + +// Because of this, it is important to measure the warp size different ways, +// that can evidence both the physical limitations of the hardware, and the +// actual behavior of the driver. + +// Additionally,the SM can behave in two different ways when the assigned +// workload is smaller than the warp size. + +// In Case 1, like ARM Mali, the SM can assign dummy workloads to fill empty +// threads and maintain a uniform workload. + +// In Case 2, like in Adreno, the driver might decide to pack multiple works +// together and dispatch them at once. +void warp_size(const App& app, const bool verbose = false) { + if (!app.enabled("warp_size")) { + std::cout << "Skipped Warp Size" << std::endl; + return; + } + + std::cout << "\n------ Warp Size ------" << std::endl; + + // Method A: Stress test with a kernel that uses complex ALU operations like + // integer division to avoid latency hiding. Increase the number of threads + // until a jump in latency is detected. + + // This timing-based method helps us identify physical warp sizes. It also + // helps with Case 2, when threads of multiple warps are managed by the same + // scheduler at the same time. + const double COMPENSATE = app.get_config("warp_size", "compensate"); + const double THRESHOLD = app.get_config("warp_size", "threshold"); + + uint32_t NITER; + + auto bench = [&](uint32_t nthread) { + StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "warp_size_physical"; + + auto time = benchmark_on_gpu(shader_name, 10, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + // Large number of work groups selected to potentially saturate all + // ALUs and thus have a better baseline for comparison. + {nthread, 1024, 1}, + {nthread, 1, 1}, + {SV(NITER)}, + VK_NULL_HANDLE, + 0, + out_buf.buffer()); + }); + + return time; + }; + + ensure_min_niter(1000, NITER, [&]() { return bench(1); }); + + uint32_t warp_size = app.subgroup_size; + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + + // We increase the number of threads until we hit a jump in the data. + uint32_t nthread = 1; + for (; nthread <= app.nthread_logic; ++nthread) { + double time = bench(nthread); + std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)" + << std::endl; + if (dj.push(time)) { + warp_size = nthread - 1; + break; + } + } + if (nthread >= app.nthread_logic) { + std::cout + << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size" + << std::endl; + } + + // Method B: Let all the threads in a warp race and atomically fetch-add + // a counter, then store the counter values to the output buffer in the + // scheduling order of these threads. If all the order numbers follow an + // ascending order, then the threads are likely executing within a warp. + // Threads in different warps are not managed by the same scheduler, so they + // would race for a same ID out of order, unaware of each other. + + // This method evidences the actual driver behavior when running + // concurrency, regardless of the physical limitations of the hardware. + + // Likewise, this method helps us identify warp sizes when the SM + // sub-divides its ALUs into independent groups, like the three execution + // engines in a Mali G76 core. 
It helps warp-probing in Case 1 because it + // doesn't depend on kernel timing, so the extra wait time doesn't lead to + // inaccuracy. + auto bench_sm = [&](uint32_t nthread) { + StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "warp_size_scheduler"; + + benchmark_on_gpu(shader_name, 1, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {nthread, 1, 1}, + {nthread, 1, 1}, + {}, + VK_NULL_HANDLE, + 0, + out_buf.buffer()); + }); + + std::vector data(app.nthread_logic); + copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes()); + + if (verbose) { + std::stringstream ss; + for (auto j = 0; j < nthread; ++j) { + ss << data[j] << " "; + } + std::cout << ss.str() << std::endl; + } + + // Check until which point is the data in ascending order. + int32_t last = -1; + int32_t j = 0; + for (; j < nthread; ++j) { + if (last >= data[j]) { + break; + } + last = data[j]; + } + + return j; + }; + + // Test increasing sizes until the data is no longer in ascending order. + uint32_t warp_size_scheduler = warp_size; + int i = 1; + for (; i <= app.nthread_logic; ++i) { + uint32_t nascend = bench_sm(i); + if (nascend != i) { + warp_size_scheduler = nascend; + break; + } + } + if (i > app.nthread_logic) { + std::cout << "Unable to conclude an SM Warp Size." << std::endl; + } + + std::cout << "PhysicalWarpSize," << warp_size << std::endl; + std::cout << "SMWarpSize," << warp_size_scheduler << std::endl; +} +}; // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h new file mode 100644 index 0000000000..8cb0da49ca --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/include/buffers.h @@ -0,0 +1,203 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include "app.h" +#include "stats.h" +#include "utils.h" + +using namespace vkapi; + +namespace gpuinfo { + +void buf_cacheline_size(const App& app) { + if (!app.enabled("buf_cacheline_size")) { + std::cout << "Skipped Buffer Cacheline Size" << std::endl; + return; + } + + std::cout << std::endl; + std::cout << "------ Buffer Cacheline Size ------" << std::endl; + + const double COMPENSATE = app.get_config("buf_cacheline_size", "compensate"); + const double THRESHOLD = app.get_config("buf_cacheline_size", "threshold"); + + const uint32_t PITCH = app.buf_cache_size / app.nthread_logic; + const uint32_t BUF_SIZE = app.buf_cache_size; + const uint32_t MAX_STRIDE = PITCH; + + uint32_t NITER; + + auto bench = [&](int stride) { + StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); + StorageBuffer out_buf(context(), vkapi::kFloat, 1); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "buf_cacheline_size"; + + auto time = benchmark_on_gpu(shader_name, 100, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {app.nthread_logic, 1, 1}, + {app.nthread_logic, 1, 1}, + {SV(NITER), SV(stride), SV(PITCH)}, + VK_NULL_HANDLE, + 0, + in_buf.buffer(), + out_buf.buffer()); + }); + return time; + }; + + ensure_min_niter(1000, NITER, [&]() { return bench(1); }); + + uint32_t cacheline_size; + + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + uint32_t stride = 1; + for (; stride <= MAX_STRIDE; ++stride) { + double time = bench(stride); + std::cout << "Testing stride=\t" << stride << "\t, time=\t" << time + << std::endl; + + if (dj.push(time)) { + cacheline_size = stride * sizeof(float); + break; + } + } + if (stride >= MAX_STRIDE) { + std::cout << "Unable to conclude a top level buffer cacheline size." + << std::endl; + cacheline_size = MAX_STRIDE * sizeof(float); + } + + std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl; +} + +void _bandwidth( + const App& app, + const std::string memtype, + const uint32_t range) { + auto memtype_lower = memtype; + std::transform( + memtype_lower.begin(), + memtype_lower.end(), + memtype_lower.begin(), + [](unsigned char c) { return std::tolower(c); }); + + auto test_name = memtype_lower + "_bandwidth"; + + // Cache lines flushed + const uint32_t NFLUSH = app.get_config(test_name, "nflush"); + // Number of loop unrolls. Changing this value requires an equal change in + // buf_bandwidth.yaml + const uint32_t NUNROLL = app.get_config(test_name, "nunroll"); + // Number of iterations. Increasing this value reduces noise in exchange for + // higher latency. + const uint32_t NITER = app.get_config(test_name, "niter"); + // Vector dimensions (vec4) + const uint32_t VEC_WIDTH = 4; + const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); + // Number of vectors that fit in the selected memory space + const uint32_t NVEC = range / VEC_SIZE; + // Number of memory reads per thread + const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; + // Number of threads needed to read al l vectors + // The thread count doesn't divide by thread workload in shared memory + // because of the limited memory size. + const uint32_t NTHREAD = memtype == "Shared" ? 
NVEC : NVEC / NREAD_PER_THREAD; + // Occupy all threads + const uint32_t local_x = app.nthread_logic; + // Ensure that global is a multiple of local, and distribute across all SMs + const uint32_t global_x = + (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH; + + auto bench = [&](uint32_t access_size) { + // Number of vectors that fit in this iteration + const uint32_t nvec_access = access_size / VEC_SIZE; + + StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); + StorageBuffer out_buf( + context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "buf_bandwidth_" + memtype_lower; + + auto time = benchmark_on_gpu(shader_name, 10, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {global_x, 1, 1}, + {local_x, 1, 1}, + {SV(NITER), SV(nvec_access), SV(local_x)}, + VK_NULL_HANDLE, + 0, + in_buf.buffer(), + out_buf.buffer()); + }); + + const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; + auto gbps = SIZE_TRANS * 1e-3 / time; + std::cout << memtype << " bandwidth accessing \t" << access_size + << "\tB unique data is \t" << gbps << " \tgbps (\t" << time + << "\tus)" << std::endl; + return gbps; + }; + + double max_bandwidth = 0; + double min_bandwidth = DBL_MAX; + for (uint32_t access_size = VEC_SIZE; access_size < range; access_size *= 2) { + double gbps = bench(access_size); + max_bandwidth = std::max(gbps, max_bandwidth); + min_bandwidth = std::min(gbps, min_bandwidth); + } + + std::cout << "Max" << memtype << "Bandwidth (GB/s)," << max_bandwidth + << std::endl; + std::cout << "Min" << memtype << "Bandwidth (GB/s)," << min_bandwidth + << std::endl; +} + +void buf_bandwidth(const App& app) { + if (!app.enabled("buffer_bandwidth")) { + std::cout << "Skipped Memory Bandwidth" << std::endl; + return; + } + + std::cout << "\n------ Memory Bandwidth ------" << std::endl; + // Maximum memory space read - 128MB + // For regular devices, bandwidth plateaus at less memory than this, so more + // is not needed. + const uint32_t RANGE = app.get_config("buffer_bandwidth", "range"); + _bandwidth(app, "Buffer", RANGE); +} + +void ubo_bandwidth(const App& app) { + if (!app.enabled("ubo_bandwidth")) { + std::cout << "Skipped UBO Bandwidth" << std::endl; + return; + } + + std::cout << "\n------ UBO Bandwidth ------" << std::endl; + const uint32_t RANGE = app.get_config("ubo_bandwidth", "range"); + _bandwidth(app, "UBO", RANGE); +} + +void shared_mem_bandwidth(const App& app) { + if (!app.enabled("shared_bandwidth")) { + std::cout << "Skipped Shared Memory Bandwidth" << std::endl; + return; + } + + std::cout << "\n------ Shared Bandwidth ------" << std::endl; + const uint32_t RANGE = app.max_shared_mem_size; + _bandwidth(app, "Shared", RANGE); +} +} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h new file mode 100644 index 0000000000..bb8a3371a9 --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/include/textures.h @@ -0,0 +1,207 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include "app.h" +#include "stats.h" +#include "utils.h" + +namespace gpuinfo { + +// Textures are drastically different from buffers in terms of data layout. 
+// While buffers are a contiguous range of memory, textures are opaque objects +// defined by the vendor and it is possible that nearby points of data are not +// neighboring in memory. Likewise, data points are accessed in +// multi-dimensional patches instead of simple lines. This makes the stride +// method for figuring out the cache line size not applicable. To go around +// this, this experiment runs an increasing amount of threads accessing +// different datapoints in the texture and measures latency. If the cache line +// is big enough to contain all requested data for the amount of threads, +// latency will be low. When there are more threads and hence more data than +// what a single cache line can handle, a second line must be fetched, +// increasing latency in a measurable way. +void tex_cacheline_concurr(const App& app) { + if (!app.enabled("tex_cacheline_concurr")) { + std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl; + return; + } + + const uint32_t TEXEL_WIDTH = 4; + const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH; + + const double COMPENSATE = + app.get_config("tex_cacheline_concurr", "compensate"); + const double THRESHOLD = app.get_config("tex_cacheline_concurr", "threshold"); + + for (int dim = 0; dim < 3; ++dim) { + std::cout << std::endl; + std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim + << ") ------" << std::endl; + + uint32_t NITER; + + const uint32_t IMG_OTHER_EDGE = dim == 0 ? app.max_tex_width + : dim == 1 ? app.max_tex_height + : app.max_tex_depth; + + const uint32_t MAX_NTHREAD = std::min(app.nthread_logic, IMG_OTHER_EDGE); + + auto bench = [&](uint32_t nthread) { + std::vector sizes_whd = { + app.max_tex_width, app.max_tex_height, app.max_tex_depth}; + + auto sizes_nchw = whd_to_nchw(sizes_whd); + + vTensor in_tensor = + api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); + + StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); + + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim); + + auto time = benchmark_on_gpu(shader_name, 100, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {nthread, 1, 1}, + {nthread, 1, 1}, + {SV(NITER)}, + VK_NULL_HANDLE, + 0, + in_tensor.image(), + out_buf.buffer()); + }); + return time; + }; + + ensure_min_niter(1000, NITER, [&]() { return bench(1); }); + + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + uint32_t nthread = 1; + for (; nthread <= MAX_NTHREAD; ++nthread) { + double time = bench(nthread); + std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time + << std::endl; + + if (dj.push(time)) { + auto max_concurrency = nthread - 1; + std::cout << "TextureCachelineConcurrencyDim" << dim << " (B)," + << max_concurrency * TEXEL_SIZE << std::endl; + break; + } + } + if (nthread >= MAX_NTHREAD) { + std::cout + << "Unable to conclude an optimal texture cacheline concurrency for dim " + << dim << std::endl; + }; + } + + // TODO: Use concurrency information to obtain the cache line size for + // textures as done in https://fburl.com/98xiou3g +} + +void tex_bandwidth(const App& app) { + if (!app.enabled("tex_bandwidth")) { + std::cout << "Skipped Texture Bandwidth" << std::endl; + return; + } + + for (int dim = 0; dim < 3; dim++) { + std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------" + << std::endl; + const uint32_t MAX_SIZE = dim == 0 ? app.max_tex_width + : dim == 1 ? 
app.max_tex_height + : app.max_tex_depth; + + // rgba, float + const uint32_t VEC_WIDTH = 4; + const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); + const uint32_t NVEC = MAX_SIZE; + + const uint32_t RANGE = NVEC * VEC_SIZE; + + // Cache lines flushed + const uint32_t NFLUSH = app.get_config("tex_bandwidth", "nflush"); + // Number of loop unrolls. Changing this value requires an equal change in + // tex_bandwidth.yaml + const uint32_t NUNROLL = app.get_config("tex_bandwidth", "nunroll"); + // Number of iterations. Increasing this value reduces noise in exchange + // for higher latency. + const uint32_t NITER = app.get_config("tex_bandwidth", "niter"); + // Number of memory reads per thread + const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; + // Number of threads needed to read all texells + const uint32_t NTHREAD = NVEC; + // Occupy all threads + const uint32_t local_x = app.nthread_logic; + // Ensure that global is a multiple of local, and distribute across all + // SMs + const uint32_t global_x = + (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH; + + auto shader_name = "tex_bandwidth_" + std::to_string(dim); + + std::vector sizes_whd = {MAX_SIZE, 1, 1}; + if (dim == 1) { + sizes_whd = {1, MAX_SIZE, 1}; + } else if (dim == 2) { + sizes_whd = {1, 1, MAX_SIZE}; + } + auto sizes_nchw = whd_to_nchw(sizes_whd); + + vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); + + auto bench = [&](uint32_t access_size, uint32_t dim) { + // Number of texels that fit in this iteration + const uint32_t ntexel_access = access_size / VEC_SIZE; + + StorageBuffer out_buf( + context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto time = benchmark_on_gpu(shader_name, 10, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {global_x, 1, 1}, + {local_x, 1, 1}, + {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)}, + VK_NULL_HANDLE, + 0, + in_tensor.image(), + out_buf.buffer()); + }); + + const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; + double gbps = SIZE_TRANS * 1e-3 / time; + std::cout << "Texture bandwidth accessing \t" << access_size + << "\tB unique data is \t" << gbps << " \tgbps (\t" << time + << "\tus)" << std::endl; + return gbps; + }; + + double max_bandwidth = 0; + double min_bandwidth = DBL_MAX; + for (uint32_t access_size = VEC_SIZE; access_size < RANGE; + access_size *= 2) { + double gbps = bench(access_size, dim); + max_bandwidth = std::max(gbps, max_bandwidth); + min_bandwidth = std::min(gbps, min_bandwidth); + } + + std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth + << std::endl; + std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth + << std::endl; + } +} +} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/utils.h b/backends/vulkan/tools/gpuinfo/include/utils.h index 231fb32c5a..887cb443ef 100644 --- a/backends/vulkan/tools/gpuinfo/include/utils.h +++ b/backends/vulkan/tools/gpuinfo/include/utils.h @@ -54,6 +54,15 @@ void ensure_min_niter( } } +std::vector whd_to_nchw(std::vector sizes) { + const int64_t W = sizes[0]; + const int64_t H = sizes[1]; + const int64_t D = sizes[2]; + + // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} + return {1, D * 4, H, W}; +} + cl_platform_id get_cl_platform_id() { cl_uint nplatform_id; clGetPlatformIDs(0, nullptr, &nplatform_id); diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp deleted 
file mode 100644 index 2b1621db62..0000000000 --- a/backends/vulkan/tools/gpuinfo/src/app.cpp +++ /dev/null @@ -1,790 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include -#include - -#include "stats.h" -#include "utils.h" - -using namespace vkapi; - -class App { - private: - size_t buf_cache_size_; - uint32_t max_shared_mem_size_; - uint32_t sm_count_; - uint32_t nthread_logic_; - uint32_t subgroup_size_; - uint32_t max_tex_width_; - uint32_t max_tex_height_; - uint32_t max_tex_depth_; - folly::dynamic config_; - - std::vector _whd_to_nchw(std::vector sizes) { - const int64_t W = sizes[0]; - const int64_t H = sizes[1]; - const int64_t D = sizes[2]; - - // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} - return {1, D * 4, H, W}; - } - - float _get_config(const std::string& test, const std::string& key) { - if (config_[test].empty()) { - throw std::runtime_error("Missing config for " + test); - } - - if (!config_[test][key].isNumber()) { - throw std::runtime_error( - "Config for " + test + "." + key + " is not a number"); - } - - float value; - if (config_[test][key].isDouble()) { - value = config_[test][key].getDouble(); - } else { - value = config_[test][key].getInt(); - } - - std::cout << "Read value for " << test << "." << key << " = " << value - << std::endl; - return value; - } - - bool _enabled(const std::string& test) { - if (config_.empty() || config_[test].empty() || - !config_[test]["enabled"].isBool()) { - return true; - } - return config_[test]["enabled"].getBool(); - } - - public: - App() { - context()->initialize_querypool(); - - std::cout << context()->adapter_ptr()->stringize() << std::endl - << std::endl; - - auto cl_device = get_cl_device(); - - sm_count_ = cl_device.getInfo(); - nthread_logic_ = cl_device.getInfo(); - buf_cache_size_ = cl_device.getInfo(); - max_shared_mem_size_ = cl_device.getInfo(); - max_tex_width_ = cl_device.getInfo(); - max_tex_height_ = cl_device.getInfo(); - max_tex_depth_ = cl_device.getInfo(); - - VkPhysicalDeviceSubgroupProperties subgroup_props{}; - VkPhysicalDeviceProperties2 props2{}; - - props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; - props2.pNext = &subgroup_props; - subgroup_props.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; - vkGetPhysicalDeviceProperties2( - context()->adapter_ptr()->physical_handle(), &props2); - subgroup_size_ = subgroup_props.subgroupSize; - - std::cout << std::endl; - std::cout << "SM count," << sm_count_ << std::endl; - std::cout << "Logic Thread Count," << nthread_logic_ << std::endl; - std::cout << "Cache Size," << buf_cache_size_ << std::endl; - std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl; - std::cout << "SubGroup Size," << subgroup_size_ << std::endl; - std::cout << "MaxTexWidth," << max_tex_width_ << std::endl; - std::cout << "MaxTexHeight," << max_tex_height_ << std::endl; - std::cout << "MaxTexDepth," << max_tex_depth_ << std::endl; - } - - void load_config(std::string file_path) { - std::ifstream file(file_path); - std::stringstream buffer; - buffer << file.rdbuf(); - const std::string json_str = buffer.str(); - if (json_str.empty()) { - throw std::runtime_error( - "Failed to read config file from " + file_path + "."); - } - config_ = folly::parseJson(json_str); - } - - void reg_count() { - if (!_enabled("reg_count")) { 
- std::cout << "Skipped Register Count" << std::endl; - return; - } - - std::cout << std::endl; - std::cout << "------ Register Count ------" << std::endl; - const uint32_t NREG_MIN = 1; - const uint32_t NREG_MAX = 512; - const uint32_t NREG_STEP = 1; - - const double COMPENSATE = _get_config("reg_count", "compensate"); - const double THRESHOLD = _get_config("reg_count", "threshold"); - - const uint32_t NGRP_MIN = 1; - const uint32_t NGRP_MAX = 64; - const uint32_t NGRP_STEP = 1; - - uint32_t NITER; - - auto bench = [&](uint32_t ngrp, uint32_t nreg) { - StorageBuffer buffer(context(), vkapi::kFloat, 1); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "reg_count_" + std::to_string(nreg); - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {1, ngrp, 1}, - {1, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - buffer.buffer()); - }); - return time; - }; - - std::cout << "Calculating NITER..." << std::endl; - ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); }); - std::cout << "NITER," << NITER << std::endl; - - uint32_t nreg_max; - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t nreg = NREG_MIN; - for (; nreg <= NREG_MAX; nreg += NREG_STEP) { - double time = bench(1, nreg); - std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time - << std::endl; - if (dj.push(time)) { - nreg -= NREG_STEP; - nreg_max = nreg; - break; - } - } - if (nreg >= NREG_MAX) { - std::cout << "Unable to conclude a maximal register count" << std::endl; - nreg_max = NREG_STEP; - } else { - std::cout << nreg_max << " registers are available at most" << std::endl; - } - - auto find_ngrp_by_nreg = [&](const uint32_t nreg) { - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) { - auto time = bench(ngrp, nreg); - std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp - << ", time=" << time << " us" << std::endl; - - if (dj.push(time)) { - ngrp -= NGRP_STEP; - std::cout << "Using " << nreg << " registers can have " << ngrp - << " concurrent single-thread workgroups" << std::endl; - return ngrp; - } - } - std::cout - << "Unable to conclude a maximum number of concurrent single-thread workgroups when " - << nreg << " registers are occupied" << std::endl; - return (uint32_t)1; - }; - - uint32_t ngrp_full, ngrp_half; - ngrp_full = find_ngrp_by_nreg(nreg_max); - ngrp_half = find_ngrp_by_nreg(nreg_max / 2); - - std::string reg_ty; - - if (ngrp_full * 1.5 < ngrp_half) { - std::cout << "All physical threads in an sm share " << nreg_max - << " registers" << std::endl; - reg_ty = "Pooled"; - - } else { - std::cout << "Each physical thread has " << nreg_max << " registers" - << std::endl; - reg_ty = "Dedicated"; - } - - std::cout << std::endl << std::endl; - std::cout << "NITER," << NITER << std::endl; - std::cout << "Max registers," << nreg_max << std::endl; - std::cout << "Concurrent full single thread workgroups," << ngrp_full - << std::endl; - std::cout << "Concurrent half single thread workgroups," << ngrp_half - << std::endl; - std::cout << "Register type," << reg_ty << std::endl; - } - - void buf_cacheline_size() { - if (!_enabled("buf_cacheline_size")) { - std::cout << "Skipped Buffer Cacheline Size" << std::endl; - return; - } - - std::cout << std::endl; - std::cout << "------ Buffer Cacheline Size ------" << std::endl; - - const double COMPENSATE = _get_config("buf_cacheline_size", "compensate"); - const double 
THRESHOLD = _get_config("buf_cacheline_size", "threshold"); - - const uint32_t PITCH = buf_cache_size_ / nthread_logic_; - const uint32_t BUF_SIZE = buf_cache_size_; - const uint32_t MAX_STRIDE = PITCH; - - uint32_t NITER; - - auto bench = [&](int stride) { - StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); - StorageBuffer out_buf(context(), vkapi::kFloat, 1); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "buf_cacheline_size"; - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread_logic_, 1, 1}, - {nthread_logic_, 1, 1}, - {SV(NITER), SV(stride), SV(PITCH)}, - VK_NULL_HANDLE, - 0, - in_buf.buffer(), - out_buf.buffer()); - }); - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - uint32_t cacheline_size; - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t stride = 1; - for (; stride <= MAX_STRIDE; ++stride) { - double time = bench(stride); - std::cout << "Testing stride=\t" << stride << "\t, time=\t" << time - << std::endl; - - if (dj.push(time)) { - cacheline_size = stride * sizeof(float); - break; - } - } - if (stride >= MAX_STRIDE) { - std::cout << "Unable to conclude a top level buffer cacheline size." - << std::endl; - cacheline_size = MAX_STRIDE * sizeof(float); - } - - std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl; - } - - // Textures are drastically different from buffers in terms of data layout. - // While buffers are a contiguous range of memory, textures are opaque objects - // defined by the vendor and it is possible that nearby points of data are not - // neighboring in memory. Likewise, data points are accessed in - // multi-dimensional patches instead of simple lines. This makes the stride - // method for figuring out the cache line size not applicable. To go around - // this, this experiment runs an increasing amount of threads accessing - // different datapoints in the texture and measures latency. If the cache line - // is big enough to contain all requested data for the amount of threads, - // latency will be low. When there are more threads and hence more data than - // what a single cache line can handle, a second line must be fetched, - // increasing latency in a measurable way. - void tex_cacheline_concurr() { - if (!_enabled("tex_cacheline_concurr")) { - std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl; - return; - } - - const uint32_t TEXEL_WIDTH = 4; - const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH; - - const double COMPENSATE = - _get_config("tex_cacheline_concurr", "compensate"); - const double THRESHOLD = _get_config("tex_cacheline_concurr", "threshold"); - - for (int dim = 0; dim < 3; ++dim) { - std::cout << std::endl; - std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim - << ") ------" << std::endl; - - uint32_t NITER; - - const uint32_t IMG_OTHER_EDGE = dim == 0 ? max_tex_width_ - : dim == 1 ? 
max_tex_height_ - : max_tex_depth_; - - const uint32_t MAX_NTHREAD = std::min(nthread_logic_, IMG_OTHER_EDGE); - - auto bench = [&](uint32_t nthread) { - std::vector sizes_whd = { - max_tex_width_, max_tex_height_, max_tex_depth_}; - - auto sizes_nchw = _whd_to_nchw(sizes_whd); - - vTensor in_tensor = - api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - - StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); - - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim); - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread, 1, 1}, - {nthread, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - in_tensor.image(), - out_buf.buffer()); - }); - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t nthread = 1; - for (; nthread <= MAX_NTHREAD; ++nthread) { - double time = bench(nthread); - std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time - << std::endl; - - if (dj.push(time)) { - auto max_concurrency = nthread - 1; - std::cout << "TextureCachelineConcurrencyDim" << dim << " (B)," - << max_concurrency * TEXEL_SIZE << std::endl; - break; - } - } - if (nthread >= MAX_NTHREAD) { - std::cout - << "Unable to conclude an optimal texture cacheline concurrency for dim " - << dim << std::endl; - }; - } - - // TODO: Use concurrency information to obtain the cache line size for - // textures as done in https://fburl.com/98xiou3g - } - - private: - void _bandwidth(std::string memtype, uint32_t range) { - auto memtype_lower = memtype; - std::transform( - memtype_lower.begin(), - memtype_lower.end(), - memtype_lower.begin(), - [](unsigned char c) { return std::tolower(c); }); - - auto test_name = memtype_lower + "_bandwidth"; - - // Cache lines flushed - const uint32_t NFLUSH = _get_config(test_name, "nflush"); - // Number of loop unrolls. Changing this value requires an equal change in - // buf_bandwidth.yaml - const uint32_t NUNROLL = _get_config(test_name, "nunroll"); - // Number of iterations. Increasing this value reduces noise in exchange for - // higher latency. - const uint32_t NITER = _get_config(test_name, "niter"); - // Vector dimensions (vec4) - const uint32_t VEC_WIDTH = 4; - const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); - // Number of vectors that fit in the selected memory space - const uint32_t NVEC = range / VEC_SIZE; - // Number of memory reads per thread - const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; - // Number of threads needed to read al l vectors - // The thread count doesn't divide by thread workload in shared memory - // because of the limited memory size. - const uint32_t NTHREAD = - memtype == "Shared" ? 
NVEC : NVEC / NREAD_PER_THREAD; - // Occupy all threads - const uint32_t local_x = nthread_logic_; - // Ensure that global is a multiple of local, and distribute across all SMs - const uint32_t global_x = - (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH; - - auto bench = [&](uint32_t access_size) { - // Number of vectors that fit in this iteration - const uint32_t nvec_access = access_size / VEC_SIZE; - - StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); - StorageBuffer out_buf( - context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "buf_bandwidth_" + memtype_lower; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {global_x, 1, 1}, - {local_x, 1, 1}, - {SV(NITER), SV(nvec_access), SV(local_x)}, - VK_NULL_HANDLE, - 0, - in_buf.buffer(), - out_buf.buffer()); - }); - - const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; - auto gbps = SIZE_TRANS * 1e-3 / time; - std::cout << memtype << " bandwidth accessing \t" << access_size - << "\tB unique data is \t" << gbps << " \tgbps (\t" << time - << "\tus)" << std::endl; - return gbps; - }; - - double max_bandwidth = 0; - double min_bandwidth = DBL_MAX; - for (uint32_t access_size = VEC_SIZE; access_size < range; - access_size *= 2) { - double gbps = bench(access_size); - max_bandwidth = std::max(gbps, max_bandwidth); - min_bandwidth = std::min(gbps, min_bandwidth); - } - - std::cout << "Max" << memtype << "Bandwidth (GB/s)," << max_bandwidth - << std::endl; - std::cout << "Min" << memtype << "Bandwidth (GB/s)," << min_bandwidth - << std::endl; - } - - public: - void buf_bandwidth() { - if (!_enabled("buffer_bandwidth")) { - std::cout << "Skipped Memory Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ Memory Bandwidth ------" << std::endl; - // Maximum memory space read - 128MB - // For regular devices, bandwidth plateaus at less memory than this, so more - // is not needed. - const uint32_t RANGE = _get_config("buffer_bandwidth", "range"); - _bandwidth("Buffer", RANGE); - } - - void ubo_bandwidth() { - if (!_enabled("ubo_bandwidth")) { - std::cout << "Skipped UBO Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ UBO Bandwidth ------" << std::endl; - const uint32_t RANGE = _get_config("ubo_bandwidth", "range"); - _bandwidth("UBO", RANGE); - } - - void shared_mem_bandwidth() { - if (!_enabled("shared_mem_bandwidth")) { - std::cout << "Skipped Shared Memory Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ Shared Bandwidth ------" << std::endl; - const uint32_t RANGE = max_shared_mem_size_; - _bandwidth("Shared", RANGE); - } - - void tex_bandwidth() { - if (!_enabled("tex_bandwidth")) { - std::cout << "Skipped Texture Bandwidth" << std::endl; - return; - } - - for (int dim = 0; dim < 3; dim++) { - std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------" - << std::endl; - const uint32_t MAX_SIZE = dim == 0 ? max_tex_width_ - : dim == 1 ? max_tex_height_ - : max_tex_depth_; - - // rgba, float - const uint32_t VEC_WIDTH = 4; - const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); - const uint32_t NVEC = MAX_SIZE; - - const uint32_t RANGE = NVEC * VEC_SIZE; - - // Cache lines flushed - const uint32_t NFLUSH = _get_config("tex_bandwidth", "nflush"); - // Number of loop unrolls. 
Changing this value requires an equal change in - // tex_bandwidth.yaml - const uint32_t NUNROLL = _get_config("tex_bandwidth", "nunroll"); - // Number of iterations. Increasing this value reduces noise in exchange - // for higher latency. - const uint32_t NITER = _get_config("tex_bandwidth", "niter"); - // Number of memory reads per thread - const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; - // Number of threads needed to read all texells - const uint32_t NTHREAD = NVEC; - // Occupy all threads - const uint32_t local_x = nthread_logic_; - // Ensure that global is a multiple of local, and distribute across all - // SMs - const uint32_t global_x = - (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH; - - auto shader_name = "tex_bandwidth_" + std::to_string(dim); - - std::vector sizes_whd = {MAX_SIZE, 1, 1}; - if (dim == 1) { - sizes_whd = {1, MAX_SIZE, 1}; - } else if (dim == 2) { - sizes_whd = {1, 1, MAX_SIZE}; - } - auto sizes_nchw = _whd_to_nchw(sizes_whd); - - vTensor in_tensor = - api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - - auto bench = [&](uint32_t access_size, uint32_t dim) { - // Number of texels that fit in this iteration - const uint32_t ntexel_access = access_size / VEC_SIZE; - - StorageBuffer out_buf( - context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {global_x, 1, 1}, - {local_x, 1, 1}, - {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)}, - VK_NULL_HANDLE, - 0, - in_tensor.image(), - out_buf.buffer()); - }); - - const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; - double gbps = SIZE_TRANS * 1e-3 / time; - std::cout << "Texture bandwidth accessing \t" << access_size - << "\tB unique data is \t" << gbps << " \tgbps (\t" << time - << "\tus)" << std::endl; - return gbps; - }; - - double max_bandwidth = 0; - double min_bandwidth = DBL_MAX; - for (uint32_t access_size = VEC_SIZE; access_size < RANGE; - access_size *= 2) { - double gbps = bench(access_size, dim); - max_bandwidth = std::max(gbps, max_bandwidth); - min_bandwidth = std::min(gbps, min_bandwidth); - } - - std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth - << std::endl; - std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth - << std::endl; - } - } - - // Warp size is a difficult metric to obtain because the hardware limitations - // do not always coincide with the way the SM divides the workload. For - // instance, the hardware can have a warp size of 64 threads, but an SM might - // be able to simulate concurrency of 128 threads with a single scheduler. - - // Because of this, it is important to measure the warp size different ways, - // that can evidence both the physical limitations of the hardware, and the - // actual behavior of the driver. - - // Additionally,the SM can behave in two different ways when the assigned - // workload is smaller than the warp size. - - // In Case 1, like ARM Mali, the SM can assign dummy workloads to fill empty - // threads and maintain a uniform workload. - - // In Case 2, like in Adreno, the driver might decide to pack multiple works - // together and dispatch them at once. 
- void warp_size(bool verbose = false) { - if (!_enabled("warp_size")) { - std::cout << "Skipped Warp Size" << std::endl; - return; - } - - std::cout << "\n------ Warp Size ------" << std::endl; - - // Method A: Stress test with a kernel that uses complex ALU operations like - // integer division to avoid latency hiding. Increase the number of threads - // until a jump in latency is detected. - - // This timing-based method helps us identify physical warp sizes. It also - // helps with Case 2, when threads of multiple warps are managed by the same - // scheduler at the same time. - const double COMPENSATE = _get_config("warp_size", "compensate"); - const double THRESHOLD = _get_config("warp_size", "threshold"); - - uint32_t NITER; - - auto bench = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "warp_size_physical"; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - // Large number of work groups selected to potentially saturate all - // ALUs and thus have a better baseline for comparison. - {nthread, 1024, 1}, - {nthread, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - out_buf.buffer()); - }); - - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - uint32_t warp_size = subgroup_size_; - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - - // We increase the number of threads until we hit a jump in the data. - uint32_t nthread = 1; - for (; nthread <= nthread_logic_; ++nthread) { - double time = bench(nthread); - std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)" - << std::endl; - if (dj.push(time)) { - warp_size = nthread - 1; - break; - } - } - if (nthread >= nthread_logic_) { - std::cout - << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size" - << std::endl; - } - - // Method B: Let all the threads in a warp race and atomically fetch-add - // a counter, then store the counter values to the output buffer in the - // scheduling order of these threads. If all the order numbers follow an - // ascending order, then the threads are likely executing within a warp. - // Threads in different warps are not managed by the same scheduler, so they - // would race for a same ID out of order, unaware of each other. - - // This method evidences the actual driver behavior when running - // concurrency, regardless of the physical limitations of the hardware. - - // Likewise, this method helps us identify warp sizes when the SM - // sub-divides its ALUs into independent groups, like the three execution - // engines in a Mali G76 core. It helps warp-probing in Case 1 because it - // doesn't depend on kernel timing, so the extra wait time doesn't lead to - // inaccuracy. 
- auto bench_sm = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "warp_size_scheduler"; - - benchmark_on_gpu(shader_name, 1, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread, 1, 1}, - {nthread, 1, 1}, - {}, - VK_NULL_HANDLE, - 0, - out_buf.buffer()); - }); - - std::vector data(nthread_logic_); - copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes()); - - if (verbose) { - std::stringstream ss; - for (auto j = 0; j < nthread; ++j) { - ss << data[j] << " "; - } - std::cout << ss.str() << std::endl; - } - - // Check until which point is the data in ascending order. - int32_t last = -1; - int32_t j = 0; - for (; j < nthread; ++j) { - if (last >= data[j]) { - break; - } - last = data[j]; - } - - return j; - }; - - // Test increasing sizes until the data is no longer in ascending order. - uint32_t warp_size_scheduler = warp_size; - int i = 1; - for (; i <= nthread_logic_; ++i) { - uint32_t nascend = bench_sm(i); - if (nascend != i) { - warp_size_scheduler = nascend; - break; - } - } - if (i > nthread_logic_) { - std::cout << "Unable to conclude an SM Warp Size." << std::endl; - } - - std::cout << "PhysicalWarpSize," << warp_size << std::endl; - std::cout << "SMWarpSize," << warp_size_scheduler << std::endl; - } -}; - -int main(int argc, const char** argv) { - App app; - - std::string file_path = "config.json"; - if (argc > 1) { - file_path = argv[1]; - }; - app.load_config(file_path); - - app.reg_count(); - app.buf_cacheline_size(); - app.buf_bandwidth(); - app.ubo_bandwidth(); - app.shared_mem_bandwidth(); - app.warp_size(); - app.tex_bandwidth(); - app.tex_cacheline_concurr(); - - return 0; -} diff --git a/backends/vulkan/tools/gpuinfo/src/main.cpp b/backends/vulkan/tools/gpuinfo/src/main.cpp new file mode 100644 index 0000000000..f0e29aaf1a --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/src/main.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "app.h" +#include "architecture.h" +#include "buffers.h" +#include "textures.h" + +using namespace vkapi; + +int main(int argc, const char** argv) { + gpuinfo::App app; + + std::string file_path = "config.json"; + if (argc > 1) { + file_path = argv[1]; + }; + app.load_config(file_path); + + // Architecture + gpuinfo::reg_count(app); + gpuinfo::warp_size(app); + + // Buffers + gpuinfo::buf_cacheline_size(app); + gpuinfo::buf_bandwidth(app); + gpuinfo::ubo_bandwidth(app); + gpuinfo::shared_mem_bandwidth(app); + + // Textures + gpuinfo::tex_bandwidth(app); + gpuinfo::tex_cacheline_concurr(app); + + return 0; +}
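
A quick note on the bandwidth arithmetic used by `tex_bandwidth` (and the buffer variants): each step of the sweep doubles `access_size`, the host converts that into a power-of-two texel count so the shader's address mask wraps reads onto exactly that many unique texels, and the reported figure is total bytes read divided by measured time. The sketch below is not part of the patch; it restates that mapping with hypothetical example numbers so the `1e-3` factor (bytes per microsecond to GB/s) is easy to verify.

#include <cstdint>
#include <iostream>

// One rgba32f texel: 4 floats.
constexpr uint32_t VEC_SIZE = 4 * sizeof(float);

// The shader limits itself to a power-of-two number of unique texels by
// masking the offset: offset & (ntexel_access - 1) == offset % ntexel_access.
uint32_t unique_texels(uint32_t access_size_bytes) {
  return access_size_bytes / VEC_SIZE;
}

// Total bytes read = threads * reads-per-thread * bytes-per-read.
// bytes / microseconds = 1e6 bytes/s, so multiplying by 1e-3 yields GB/s,
// matching "SIZE_TRANS * 1e-3 / time" in tex_bandwidth above.
double gbps(uint32_t global_x, uint32_t niter, uint32_t nunroll, double time_us) {
  const double size_trans = double(global_x) * niter * nunroll * VEC_SIZE;
  return size_trans * 1e-3 / time_us;
}

int main() {
  // Hypothetical numbers, only to show the arithmetic.
  const uint32_t global_x = 1024, niter = 16, nunroll = 16;
  const double time_us = 50.0;
  std::cout << unique_texels(4096) << " unique texels for a 4 KiB sweep step\n";
  std::cout << gbps(global_x, niter, nunroll, time_us) << " GB/s\n";
  return 0;
}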
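The probes detect their knees (one more cache line, one more scheduled warp) by feeding successive timings into `DtJumpFinder<5>` and stopping at the first reported jump. `DtJumpFinder` itself lives in stats.h and is not part of this diff; the class below is only an illustrative stand-in that assumes a windowed average with a `compensate`/`threshold` style test, and the real compensation rule may differ.

#include <deque>

// Illustrative stand-in for DtJumpFinder<N>: keep a short window of recent
// timings and report a "jump" when the newest sample exceeds the windowed
// average (plus a small compensation term) by more than `threshold` times.
class JumpFinderSketch {
 public:
  JumpFinderSketch(double compensate, double threshold, size_t window = 5)
      : compensate_(compensate), threshold_(threshold), window_(window) {}

  // Returns true when `time_us` looks like a discontinuity, e.g. an extra
  // cache line had to be fetched or one more warp had to be scheduled.
  bool push(double time_us) {
    if (history_.size() == window_) {
      double avg = 0;
      for (double t : history_) {
        avg += t;
      }
      avg /= double(history_.size());
      if (time_us > (avg + compensate_) * threshold_) {
        return true;
      }
      history_.pop_front();
    }
    history_.push_back(time_us);
    return false;
  }

 private:
  std::deque<double> history_;
  double compensate_;
  double threshold_;
  size_t window_;
};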
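For reference, the `whd_to_nchw` helper moved into utils.h encodes the channels-packed layout the texture probes rely on: a `{W, H, D}` extent maps to a `{1, D * 4, H, W}` tensor because every texel carries four channels. A minimal, self-contained usage example follows; the 16384-wide size is only a plausible placeholder for `max_tex_width`.

#include <cstdint>
#include <iostream>
#include <vector>

// Mirrors the whd_to_nchw() helper added to utils.h: a {W, H, D} texture
// extent is interpreted as a channels-packed tensor, so the D axis unfolds
// into (C / 4) * N.
std::vector<int64_t> whd_to_nchw(std::vector<int64_t> sizes) {
  const int64_t W = sizes[0];
  const int64_t H = sizes[1];
  const int64_t D = sizes[2];
  return {1, D * 4, H, W};
}

int main() {
  // e.g. a 16384 x 1 x 1 texture used for the X-dimension bandwidth probe
  // becomes a {1, 4, 1, 16384} float tensor: N=1, C=4, H=1, W=16384.
  for (int64_t d : whd_to_nchw({16384, 1, 1})) {
    std::cout << d << " ";
  }
  std::cout << std::endl;
  return 0;
}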