pytorch · Esteb37 · Jul 30, 2024 · Jul 30, 2024 · Jul 30, 2024 · Jul 30, 2024
diff --git a/backends/vulkan/tools/gpuinfo/config.json b/backends/vulkan/tools/gpuinfo/config.json
@@ -0,0 +1,48 @@
+{
+  "reg_count": {
+    "enabled": true,
+    "threshold": 3,
+    "compensate": 0.1
+  },
+  "buf_cacheline_size": {
+    "enabled": true,
+    "threshold": 10,
+    "compensate": 0.1
+  },
+  "buffer_bandwidth": {
+    "enabled": true,
+    "range": 134217728,
+    "nflush": 4,
+    "nunroll": 16,
+    "niter": 10
+  },
+  "ubo_bandwidth": {
+    "enabled": true,
+    "range": 134217728,
+    "nflush": 4,
+    "nunroll": 16,
+    "niter": 10
+  },
+  "shared_bandwidth": {
+    "enabled": true,
+    "nflush": 4,
+    "nunroll": 16,
+    "niter": 10
+  },
+  "warp_size": {
+    "enabled": true,
+    "threshold": 3,
+    "compensate": 0.1
+  },
+  "tex_bandwidth": {
+    "enabled": true,
+    "nflush": 4,
+    "nunroll": 16,
+    "niter": 10
+  },
+  "tex_cacheline_concurr": {
+    "enabled": true,
+    "threshold": 3,
+    "compensate": 0.1
+  }
+}
diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define VEC4_T ${texel_type(DTYPE)}
+
+layout(std430) buffer;
+
+${layout_declare_sampler(0, "r", "A", DTYPE)}
+${layout_declare_buffer(1, "w", "B", DTYPE, "PRECISION", False)}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int niter = 1;
+layout(constant_id = 4) const int nvec = 1;
+layout(constant_id = 5) const int local_group_size = 1;
+
+void main() {
+    // The address mask works as a modulo because x % 2^n == x & (2^n - 1).
+    // This will help us limit address accessing to a specific set of unique
+    // addresses depending on the access size we want to measure.
+    const int addr_mask = nvec - 1;
+    vec4 sum = vec4(0);
+
+    // This is to distribute the accesses to unique addresses across the workgroups, once the
+    // size of the access excedes the workgroup width.
+    const uint workgroup_width = local_group_size * niter * ${NUNROLL};
+    uint offset = (gl_WorkGroupID[0] * workgroup_width  + gl_LocalInvocationID[0]) & addr_mask;
+
+    int i = 0;
+    for (; i < niter; ++i){
+      VEC4_T in_texel;
+      $for j in range(int(NUNROLL)):
+        $if DIM == 0:
+            in_texel = texelFetch(A, ivec3(offset, 0, 0), 0);
+        $elif DIM == 1:
+            in_texel = texelFetch(A, ivec3(0, offset, 0), 0);
+        $elif DIM == 2:
+            in_texel = texelFetch(A, ivec3(0, 0, offset), 0);
+
+        sum *= in_texel;
+
+        // On each unroll, a new unique address will be accessed through the offset,
+        // limited by the address mask to a specific set of unique addresses
+        offset = (offset + local_group_size) & addr_mask;
+    }
+
+    // This is to ensure no compiler optimizations occur
+    vec4 zero = vec4(i>>31);
+
+    B[gl_LocalInvocationID[0]] = sum + zero;
+}
diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml
@@ -0,0 +1,15 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+tex_bandwidth:
+  parameter_names_with_default_values:
+    DTYPE: float
+    NUNROLL: "16"
+  generate_variant_forall:
+    DIM:
+      - RANGE: [0, 2]
+  shader_variants:
+    - NAME: tex_bandwidth
diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define VEC4_T ${texel_type(DTYPE)}
+
+layout(std430) buffer;
+
+${layout_declare_sampler(0, "r", "in_tex", DTYPE)}
+${layout_declare_buffer(1, "w", "out_buf", DTYPE, "PRECISION", False)}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int niter = 1;
+
+void main() {
+    vec4 sum = vec4(0);
+    int i = 0;
+    for (; i < niter; ++i){
+        $if DIM == 0:
+            sum += texelFetch(in_tex, ivec3(gl_GlobalInvocationID[0], 0, 0), 0);
+        $elif DIM == 1:
+            sum +=  texelFetch(in_tex, ivec3(0, gl_GlobalInvocationID[0], 0), 0);
+        $elif DIM == 2:
+            sum +=  texelFetch(in_tex, ivec3(0, 0, gl_GlobalInvocationID[0]), 0);
+    }
+
+    // This is to ensure no compiler optimizations occur
+    vec4 zero = vec4(i>>31);
+
+    out_buf[0] = sum + zero;
+}
diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml
@@ -0,0 +1,14 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+tex_cacheline_concurr:
+  parameter_names_with_default_values:
+    DTYPE: float
+  generate_variant_forall:
+    DIM:
+      - RANGE: [0, 2]
+  shader_variants:
+    - NAME: tex_cacheline_concurr
diff --git a/backends/vulkan/tools/gpuinfo/include/app.h b/backends/vulkan/tools/gpuinfo/include/app.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/api/api.h>
+#include <folly/json.h>
+#include <fstream>
+#include <iostream>
+
+#include "utils.h"
+
+namespace gpuinfo {
+
+class App {
+ private:
+  folly::dynamic config_;
+
+ public:
+  size_t buf_cache_size;
+  uint32_t max_shared_mem_size;
+  uint32_t sm_count;
+  uint32_t nthread_logic;
+  uint32_t subgroup_size;
+  uint32_t max_tex_width;
+  uint32_t max_tex_height;
+  uint32_t max_tex_depth;
+
+  App() {
+    context()->initialize_querypool();
+
+    std::cout << context()->adapter_ptr()->stringize() << std::endl
+              << std::endl;
+
+    auto cl_device = get_cl_device();
+
+    sm_count = cl_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
+    nthread_logic = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
+    buf_cache_size = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();
+    max_shared_mem_size = cl_device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
+    max_tex_width = cl_device.getInfo<CL_DEVICE_IMAGE3D_MAX_WIDTH>();
+    max_tex_height = cl_device.getInfo<CL_DEVICE_IMAGE3D_MAX_HEIGHT>();
+    max_tex_depth = cl_device.getInfo<CL_DEVICE_IMAGE3D_MAX_DEPTH>();
+
+    VkPhysicalDeviceSubgroupProperties subgroup_props{};
+    VkPhysicalDeviceProperties2 props2{};
+
+    props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
+    props2.pNext = &subgroup_props;
+    subgroup_props.sType =
+        VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
+    vkGetPhysicalDeviceProperties2(
+        context()->adapter_ptr()->physical_handle(), &props2);
+    subgroup_size = subgroup_props.subgroupSize;
+
+    std::cout << std::endl;
+    std::cout << "SM count," << sm_count << std::endl;
+    std::cout << "Logic Thread Count," << nthread_logic << std::endl;
+    std::cout << "Cache Size," << buf_cache_size << std::endl;
+    std::cout << "Shared Memory Size," << max_shared_mem_size << std::endl;
+    std::cout << "SubGroup Size," << subgroup_size << std::endl;
+    std::cout << "MaxTexWidth," << max_tex_width << std::endl;
+    std::cout << "MaxTexHeight," << max_tex_height << std::endl;
+    std::cout << "MaxTexDepth," << max_tex_depth << std::endl;
+  }
+
+  float get_config(const std::string& test, const std::string& key) const {
+    if (config_[test].empty()) {
+      throw std::runtime_error("Missing config for " + test);
+    }
+
+    if (!config_[test][key].isNumber()) {
+      throw std::runtime_error(
+          "Config for " + test + "." + key + " is not a number");
+    }
+
+    float value;
+    if (config_[test][key].isDouble()) {
+      value = config_[test][key].getDouble();
+    } else {
+      value = config_[test][key].getInt();
+    }
+
+    std::cout << "Read value for " << test << "." << key << " = " << value
+              << std::endl;
+    return value;
+  }
+
+  bool enabled(const std::string& test) const {
+    if (config_.empty() || config_[test].empty() ||
+        !config_[test]["enabled"].isBool()) {
+      return true;
+    }
+    return config_[test]["enabled"].getBool();
+  }
+
+  void load_config(std::string file_path) {
+    std::ifstream file(file_path);
+    std::stringstream buffer;
+    buffer << file.rdbuf();
+    const std::string json_str = buffer.str();
+    if (json_str.empty()) {
+      throw std::runtime_error(
+          "Failed to read config file from " + file_path + ".");
+    }
+    config_ = folly::parseJson(json_str);
+  }
+};
+} // namespace gpuinfo