diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl
new file mode 100644
index 00000000000..d848fc04754
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define VEC4_T ${texel_type(DTYPE)}
+
+layout(std430) buffer;
+
+${layout_declare_sampler(0, "r", "A", DTYPE)}
+${layout_declare_buffer(1, "w", "B", DTYPE, "PRECISION", False)}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int niter = 1;
+layout(constant_id = 4) const int nvec = 1;
+layout(constant_id = 5) const int local_group_size = 1;
+
+void main() {
+  // The address mask works as a modulo because x % 2^n == x & (2^n - 1).
+  // This lets us limit accesses to a specific set of unique addresses,
+  // depending on the access size we want to measure.
+  const int addr_mask = nvec - 1;
+  vec4 sum = vec4(0);
+
+  // Distribute the accesses to unique addresses across the workgroups once
+  // the access size exceeds the workgroup width.
+  const uint workgroup_width = local_group_size * niter * ${NUNROLL};
+  uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask;
+
+  int i = 0;
+  for (; i < niter; ++i){
+    VEC4_T in_texel;
+    $for j in range(int(NUNROLL)):
+      $if DIM == 0:
+        in_texel = texelFetch(A, ivec3(offset, 0, 0), 0);
+      $elif DIM == 1:
+        in_texel = texelFetch(A, ivec3(0, offset, 0), 0);
+      $elif DIM == 2:
+        in_texel = texelFetch(A, ivec3(0, 0, offset), 0);
+
+      sum *= in_texel;
+
+      // On each unroll, a new unique address will be accessed through the offset,
+      // limited by the address mask to a specific set of unique addresses
+      offset = (offset + local_group_size) & addr_mask;
+  }
+
+  // Depend on the loop counter so the compiler cannot optimize the loop away
+  vec4 zero = vec4(i>>31);
+
+  B[gl_LocalInvocationID[0]] = sum + zero;
+}
diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml
new file mode 100644
index 00000000000..84da6938fd4
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml
@@ -0,0 +1,15 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+tex_bandwidth:
+  parameter_names_with_default_values:
+    DTYPE: float
+    NUNROLL: "16"
+  generate_variant_forall:
+    DIM:
+      - RANGE: [0, 2]
+  shader_variants:
+    - NAME: tex_bandwidth
diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp
index 8facdb51601..92eef840687 100644
--- a/backends/vulkan/tools/gpuinfo/src/app.cpp
+++ b/backends/vulkan/tools/gpuinfo/src/app.cpp
@@ -22,6 +22,9 @@ class App {
   uint32_t sm_count_;
   uint32_t nthread_logic_;
   uint32_t subgroup_size_;
+  uint32_t max_tex_width_;
+  uint32_t max_tex_height_;
+  uint32_t max_tex_depth_;
 
  public:
   App() {
@@ -36,6 +39,9 @@ class App {
     nthread_logic_ = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
     buf_cache_size_ = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();
     max_shared_mem_size_ = cl_device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
+    max_tex_width_ = cl_device.getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
+    max_tex_height_ = cl_device.getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+    max_tex_depth_ = cl_device.getInfo<CL_DEVICE_IMAGE3D_MAX_DEPTH>();
 
     VkPhysicalDeviceSubgroupProperties subgroup_props{};
     VkPhysicalDeviceProperties2 props2{};
@@ -54,6 +60,9 @@ class App {
     std::cout << "Cache Size," << buf_cache_size_ << std::endl;
     std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl;
     std::cout << "SubGroup Size," << subgroup_size_ << std::endl;
+    std::cout << "MaxTexWidth," << max_tex_width_ << std::endl;
+    std::cout << "MaxTexHeight," << max_tex_height_ << std::endl;
+    std::cout << "MaxTexDepth," << max_tex_depth_ << std::endl;
   }
 
   void reg_count() {
@@ -308,6 +317,15 @@ class App {
               << std::endl;
   }
 
+  std::vector<int64_t> _whd_to_nchw(std::vector<int64_t> sizes) {
+    const int64_t W = sizes[0];
+    const int64_t H = sizes[1];
+    const int64_t D = sizes[2];
+
+    // Channels-packed: {W, H, D} = {W, H, (C / 4) * N}
+    return {1, D * 4, H, W};
+  }
+
  public:
   void buf_bandwidth() {
     std::cout << "\n------ Memory Bandwidth ------" << std::endl;
@@ -323,12 +341,105 @@ class App {
     const uint32_t RANGE = 128 * 1024 * 1024;
     _bandwidth("UBO", RANGE);
   }
+
   void shared_mem_bandwidth() {
     std::cout << "\n------ Shared Bandwidth ------" << std::endl;
     const uint32_t RANGE = max_shared_mem_size_;
     _bandwidth("Shared", RANGE);
   }
 
+  void tex_bandwidth() {
+    for (int dim = 0; dim < 3; dim++) {
+      std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------"
+                << std::endl;
+      const uint32_t MAX_SIZE = dim == 0 ? max_tex_width_
+          : dim == 1 ? max_tex_height_
+                     : max_tex_depth_;
+
+      // rgba, float
+      const uint32_t VEC_WIDTH = 4;
+      const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
+      const uint32_t NVEC = MAX_SIZE;
+
+      const uint32_t RANGE = NVEC * VEC_SIZE;
+
+      // Cache lines flushed
+      const uint32_t NFLUSH = 4;
+      // Number of loop unrolls. Changing this value requires an equal change in
+      // tex_bandwidth.yaml
+      const uint32_t NUNROLL = 16;
+      // Number of iterations. Increasing this value reduces noise in exchange
+      // for higher latency.
+      const uint32_t NITER = 10;
+      // Number of memory reads per thread
+      const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
+      // Number of threads needed to read all texels
+      const uint32_t NTHREAD = NVEC;
+      // Occupy all threads
+      const uint32_t local_x = nthread_logic_;
+      // Ensure that global is a multiple of local, and distribute across all
+      // SMs
+      const uint32_t global_x =
+          (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH;
+
+      auto shader_name = "tex_bandwidth_" + std::to_string(dim);
+
+      std::vector<int64_t> sizes_whd = {MAX_SIZE, 1, 1};
+      if (dim == 1) {
+        sizes_whd = {1, MAX_SIZE, 1};
+      } else if (dim == 2) {
+        sizes_whd = {1, 1, MAX_SIZE};
+      }
+      auto sizes_nchw = _whd_to_nchw(sizes_whd);
+
+      vTensor in_tensor =
+          api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);
+
+      auto bench = [&](uint32_t access_size, uint32_t dim) {
+        // Number of texels that fit in this iteration
+        const uint32_t ntexel_access = access_size / VEC_SIZE;
+
+        StorageBuffer out_buf(
+            context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_);
+        vkapi::PipelineBarrier pipeline_barrier{};
+
+        auto time = benchmark_on_gpu(shader_name, 10, [&]() {
+          context()->submit_compute_job(
+              VK_KERNEL_FROM_STR(shader_name),
+              pipeline_barrier,
+              {global_x, 1, 1},
+              {local_x, 1, 1},
+              {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)},
+              VK_NULL_HANDLE,
+              0,
+              in_tensor.image(),
+              out_buf.buffer());
+        });
+
+        const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
+        double gbps = SIZE_TRANS * 1e-3 / time;
+        std::cout << "Texture bandwidth accessing \t" << access_size
+                  << "\tB unique data is \t" << gbps << " \tgbps (\t" << time
+                  << "\tus)" << std::endl;
+        return gbps;
+      };
+
+      double max_bandwidth = 0;
+      double min_bandwidth = DBL_MAX;
+      for (uint32_t access_size = VEC_SIZE; access_size < RANGE;
+           access_size *= 2) {
+        double gbps = bench(access_size, dim);
+        max_bandwidth = std::max(gbps, max_bandwidth);
+        min_bandwidth = std::min(gbps, min_bandwidth);
+      }
+
+      std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth
+                << std::endl;
+      std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth
+                << std::endl;
+    }
+  }
+
   // Warp size is a difficult metric to obtain because the hardware limitations
   // do not always coincide with the way the SM divides the workload. For
   // instance, the hardware can have a warp size of 64 threads, but an SM might
@@ -492,6 +603,7 @@ int main(int argc, const char** argv) {
   app.ubo_bandwidth();
   app.shared_mem_bandwidth();
   app.warp_size();
+  app.tex_bandwidth();
 
   return 0;
 }
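
Note on the arithmetic in the patch: the shader's address mask only behaves like a modulo when the number of texels accessed is a power of two, which the host guarantees by starting access_size at VEC_SIZE and doubling it on every step of the sweep, and the bench lambda turns bytes moved per dispatch and a time measured in microseconds into GB/s via the 1e-3 factor. A minimal standalone C++ sketch of both points follows; it is not part of the patch, and all concrete values in it are illustrative.

// Standalone sketch, not part of the patch; all values below are illustrative.
#include <cassert>
#include <cstdint>
#include <iostream>

int main() {
  // x % 2^n == x & (2^n - 1): the mask acts as a modulo only for power-of-two
  // sizes, which holds because access_size starts at VEC_SIZE and doubles on
  // every iteration of the sweep.
  const uint32_t ntexel_access = 64; // hypothetical power-of-two texel count
  const uint32_t addr_mask = ntexel_access - 1;
  for (uint32_t offset = 0; offset < 4096; offset += 37) {
    assert((offset & addr_mask) == (offset % ntexel_access));
  }

  // GB/s conversion used in the bench lambda: bytes per microsecond equals
  // MB/s, so multiplying by 1e-3 yields GB/s.
  const double size_trans = 1024.0 * 160 * 16; // global_x * NREAD_PER_THREAD * VEC_SIZE (hypothetical)
  const double time_us = 250.0;                // hypothetical measured time in microseconds
  const double gbps = size_trans * 1e-3 / time_us;
  std::cout << "approx. bandwidth: " << gbps << " GB/s" << std::endl;
  return 0;
}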
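
Similarly, the _whd_to_nchw helper relies on the channels-packed image layout, where every texel stores four channels; with N = 1, a texture extent of {W, H, D} therefore corresponds to an NCHW tensor of {1, 4 * D, H, W}. The following small standalone check of that mapping uses an illustrative re-implementation, not the patch's helper.

// Standalone sketch of the channels-packed WHD -> NCHW mapping; mirrors the
// intent of _whd_to_nchw in the patch but is not part of it.
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<int64_t> whd_to_nchw_sketch(const std::vector<int64_t>& whd) {
  // Each texel packs 4 channels, so with N = 1 the depth extent D corresponds
  // to C = 4 * D channels.
  return {1, whd[2] * 4, whd[1], whd[0]};
}

int main() {
  // A width-only extent of 4096 texels maps to a {1, 4, 1, 4096} tensor.
  assert((whd_to_nchw_sketch({4096, 1, 1}) == std::vector<int64_t>{1, 4, 1, 4096}));
  // A depth-only extent of 2048 texels maps to 8192 channels.
  assert((whd_to_nchw_sketch({1, 1, 2048}) == std::vector<int64_t>{1, 8192, 1, 1}));
  return 0;
}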