diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl
new file mode 100644
index 00000000000..d848fc04754
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define VEC4_T ${texel_type(DTYPE)}
+
+layout(std430) buffer;
+
+${layout_declare_sampler(0, "r", "A", DTYPE)}
+${layout_declare_buffer(1, "w", "B", DTYPE, "PRECISION", False)}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int niter = 1;
+layout(constant_id = 4) const int nvec = 1;
+layout(constant_id = 5) const int local_group_size = 1;
+
+void main() {
+  // The address mask works as a modulo because x % 2^n == x & (2^n - 1).
+  // This lets us limit accesses to a specific set of unique addresses,
+  // depending on the access size we want to measure.
+  const int addr_mask = nvec - 1;
+  vec4 sum = vec4(0);
+
+  // Distribute the accesses to unique addresses across the workgroups once
+  // the access size exceeds the workgroup width.
+  const uint workgroup_width = local_group_size * niter * ${NUNROLL};
+  uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask;
+
+  int i = 0;
+  for (; i < niter; ++i){
+    VEC4_T in_texel;
+    $for j in range(int(NUNROLL)):
+      $if DIM == 0:
+        in_texel = texelFetch(A, ivec3(offset, 0, 0), 0);
+      $elif DIM == 1:
+        in_texel = texelFetch(A, ivec3(0, offset, 0), 0);
+      $elif DIM == 2:
+        in_texel = texelFetch(A, ivec3(0, 0, offset), 0);
+
+      sum *= in_texel;
+
+      // On each unroll, a new unique address will be accessed through the offset,
+      // limited by the address mask to a specific set of unique addresses
+      offset = (offset + local_group_size) & addr_mask;
+  }
+
+  // Depend on the loop counter so the compiler cannot optimize the loop away
+  vec4 zero = vec4(i>>31);
+
+  B[gl_LocalInvocationID[0]] = sum + zero;
+}
diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml
new file mode 100644
index 00000000000..84da6938fd4
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml
@@ -0,0 +1,15 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+tex_bandwidth:
+  parameter_names_with_default_values:
+    DTYPE: float
+    NUNROLL: "16"
+  generate_variant_forall:
+    DIM:
+      - RANGE: [0, 2]
+  shader_variants:
+    - NAME: tex_bandwidth
diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp
index 8facdb51601..92eef840687 100644
--- a/backends/vulkan/tools/gpuinfo/src/app.cpp
+++ b/backends/vulkan/tools/gpuinfo/src/app.cpp
@@ -22,6 +22,9 @@ class App {
   uint32_t sm_count_;
   uint32_t nthread_logic_;
   uint32_t subgroup_size_;
+  uint32_t max_tex_width_;
+  uint32_t max_tex_height_;
+  uint32_t max_tex_depth_;
 
  public:
   App() {
@@ -36,6 +39,9 @@ class App {
     nthread_logic_ = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
     buf_cache_size_ = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();
     max_shared_mem_size_ = cl_device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
+    max_tex_width_ = cl_device.getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
+    max_tex_height_ = cl_device.getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+    max_tex_depth_ = cl_device.getInfo<CL_DEVICE_IMAGE3D_MAX_DEPTH>();
 
     VkPhysicalDeviceSubgroupProperties subgroup_props{};
     VkPhysicalDeviceProperties2 props2{};
@@ -54,6 +60,9 @@ class App {
     std::cout << "Cache Size," << buf_cache_size_ << std::endl;
     std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl;
     std::cout << "SubGroup Size," << subgroup_size_ << std::endl;
+    std::cout << "MaxTexWidth," << max_tex_width_ << std::endl;
+    std::cout << "MaxTexHeight," << max_tex_height_ << std::endl;
+    std::cout << "MaxTexDepth," << max_tex_depth_ << std::endl;
   }
 
   void reg_count() {
@@ -308,6 +317,15 @@ class App {
               << std::endl;
   }
 
+  std::vector<int64_t> _whd_to_nchw(std::vector<int64_t> sizes) {
+    const int64_t W = sizes[0];
+    const int64_t H = sizes[1];
+    const int64_t D = sizes[2];
+
+    // Channels-packed: {W, H, D} = {W, H, (C / 4) * N}
+    return {1, D * 4, H, W};
+  }
+
  public:
   void buf_bandwidth() {
     std::cout << "\n------ Memory Bandwidth ------" << std::endl;
@@ -323,12 +341,105 @@ class App {
     const uint32_t RANGE = 128 * 1024 * 1024;
     _bandwidth("UBO", RANGE);
   }
+
   void shared_mem_bandwidth() {
     std::cout << "\n------ Shared Bandwidth ------" << std::endl;
     const uint32_t RANGE = max_shared_mem_size_;
     _bandwidth("Shared", RANGE);
   }
 
+  void tex_bandwidth() {
+    for (int dim = 0; dim < 3; dim++) {
+      std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------"
+                << std::endl;
+      const uint32_t MAX_SIZE = dim == 0 ? max_tex_width_
+          : dim == 1 ? max_tex_height_
+                     : max_tex_depth_;
+
+      // rgba, float
+      const uint32_t VEC_WIDTH = 4;
+      const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
+      const uint32_t NVEC = MAX_SIZE;
+
+      const uint32_t RANGE = NVEC * VEC_SIZE;
+
+      // Cache lines flushed
+      const uint32_t NFLUSH = 4;
+      // Number of loop unrolls. Changing this value requires an equal change in
+      // tex_bandwidth.yaml
+      const uint32_t NUNROLL = 16;
+      // Number of iterations. Increasing this value reduces noise in exchange
+      // for higher latency.
+      const uint32_t NITER = 10;
+      // Number of memory reads per thread
+      const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
+      // Number of threads needed to read all texels
+      const uint32_t NTHREAD = NVEC;
+      // Occupy all threads
+      const uint32_t local_x = nthread_logic_;
+      // Ensure that global is a multiple of local, and distribute across all
+      // SMs
+      const uint32_t global_x =
+          (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH;
+
+      auto shader_name = "tex_bandwidth_" + std::to_string(dim);
+
+      std::vector<int64_t> sizes_whd = {MAX_SIZE, 1, 1};
+      if (dim == 1) {
+        sizes_whd = {1, MAX_SIZE, 1};
+      } else if (dim == 2) {
+        sizes_whd = {1, 1, MAX_SIZE};
+      }
+      auto sizes_nchw = _whd_to_nchw(sizes_whd);
+
+      vTensor in_tensor =
+          api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);
+
+      auto bench = [&](uint32_t access_size, uint32_t dim) {
+        // Number of texels that fit in this iteration
+        const uint32_t ntexel_access = access_size / VEC_SIZE;
+
+        StorageBuffer out_buf(
+            context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_);
+        vkapi::PipelineBarrier pipeline_barrier{};
+
+        auto time = benchmark_on_gpu(shader_name, 10, [&]() {
+          context()->submit_compute_job(
+              VK_KERNEL_FROM_STR(shader_name),
+              pipeline_barrier,
+              {global_x, 1, 1},
+              {local_x, 1, 1},
+              {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)},
+              VK_NULL_HANDLE,
+              0,
+              in_tensor.image(),
+              out_buf.buffer());
+        });
+
+        const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
+        double gbps = SIZE_TRANS * 1e-3 / time;
+        std::cout << "Texture bandwidth accessing \t" << access_size
+                  << "\tB unique data is \t" << gbps << " \tgbps (\t" << time
+                  << "\tus)" << std::endl;
+        return gbps;
+      };
+
+      double max_bandwidth = 0;
+      double min_bandwidth = DBL_MAX;
+      for (uint32_t access_size = VEC_SIZE; access_size < RANGE;
+           access_size *= 2) {
+        double gbps = bench(access_size, dim);
+        max_bandwidth = std::max(gbps, max_bandwidth);
+        min_bandwidth = std::min(gbps, min_bandwidth);
+      }
+
+      std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth
+                << std::endl;
+      std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth
+                << std::endl;
+    }
+  }
+
   // Warp size is a difficult metric to obtain because the hardware limitations
   // do not always coincide with the way the SM divides the workload. For
   // instance, the hardware can have a warp size of 64 threads, but an SM might
@@ -492,6 +603,7 @@ int main(int argc, const char** argv) {
   app.ubo_bandwidth();
   app.shared_mem_bandwidth();
   app.warp_size();
+  app.tex_bandwidth();
 
   return 0;
 }
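
Note on the arithmetic in the patch: the shader's address mask only behaves like a modulo when the number of texels accessed is a power of two, which the host guarantees by starting access_size at VEC_SIZE and doubling it on every step of the sweep, and the bench lambda turns bytes moved per dispatch and a time measured in microseconds into GB/s via the 1e-3 factor. A minimal standalone C++ sketch of both points follows; it is not part of the patch, and all concrete values in it are illustrative.

// Standalone sketch, not part of the patch; all values below are illustrative.
#include <cassert>
#include <cstdint>
#include <iostream>

int main() {
  // x % 2^n == x & (2^n - 1): the mask acts as a modulo only for power-of-two
  // sizes, which holds because access_size starts at VEC_SIZE and doubles on
  // every iteration of the sweep.
  const uint32_t ntexel_access = 64; // hypothetical power-of-two texel count
  const uint32_t addr_mask = ntexel_access - 1;
  for (uint32_t offset = 0; offset < 4096; offset += 37) {
    assert((offset & addr_mask) == (offset % ntexel_access));
  }

  // GB/s conversion used in the bench lambda: bytes per microsecond equals
  // MB/s, so multiplying by 1e-3 yields GB/s.
  const double size_trans = 1024.0 * 160 * 16; // global_x * NREAD_PER_THREAD * VEC_SIZE (hypothetical)
  const double time_us = 250.0;                // hypothetical measured time in microseconds
  const double gbps = size_trans * 1e-3 / time_us;
  std::cout << "approx. bandwidth: " << gbps << " GB/s" << std::endl;
  return 0;
}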
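
Similarly, the _whd_to_nchw helper relies on the channels-packed image layout, where every texel stores four channels; with N = 1, a texture extent of {W, H, D} therefore corresponds to an NCHW tensor of {1, 4 * D, H, W}. The following small standalone check of that mapping uses an illustrative re-implementation, not the patch's helper.

// Standalone sketch of the channels-packed WHD -> NCHW mapping; mirrors the
// intent of _whd_to_nchw in the patch but is not part of it.
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<int64_t> whd_to_nchw_sketch(const std::vector<int64_t>& whd) {
  // Each texel packs 4 channels, so with N = 1 the depth extent D corresponds
  // to C = 4 * D channels.
  return {1, whd[2] * 4, whd[1], whd[0]};
}

int main() {
  // A width-only extent of 4096 texels maps to a {1, 4, 1, 4096} tensor.
  assert((whd_to_nchw_sketch({4096, 1, 1}) == std::vector<int64_t>{1, 4, 1, 4096}));
  // A depth-only extent of 2048 texels maps to 8192 channels.
  assert((whd_to_nchw_sketch({1, 1, 2048}) == std::vector<int64_t>{1, 8192, 1, 1}));
  return 0;
}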