From f5683a6df73df84c2223fb7fefa95f0e19dbebe4 Mon Sep 17 00:00:00 2001 From: estebanpadilla Date: Fri, 26 Jul 2024 11:47:43 -0700 Subject: [PATCH] Add metric for 3D texture cache line size (#4421) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4421 Differential Revision: https://internalfb.com/D60246121 --- .../gpuinfo/glsl/tex_cacheline_size.glsl | 39 ++++++ .../gpuinfo/glsl/tex_cacheline_size.yaml | 14 +++ backends/vulkan/tools/gpuinfo/src/app.cpp | 113 +++++++++++++++++- 3 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.glsl create mode 100644 backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.yaml diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.glsl new file mode 100644 index 00000000000..62659c7bb88 --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.glsl @@ -0,0 +1,39 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define VEC4_T ${texel_type(DTYPE)}
+
+layout(std430) buffer;
+
+${layout_declare_sampler(0, "r", "in_tex", DTYPE)}
+${layout_declare_buffer(1, "w", "out_buf", DTYPE, "PRECISION", False)}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int niter = 1;
+// Fetches the texel at (global x id) along axis DIM, niter times per invocation.
+void main() {
+  vec4 sum = vec4(0);
+  int i = 0;
+  for (; i < niter; ++i){
+    $if DIM == 0:
+      sum += texelFetch(in_tex, ivec3(gl_GlobalInvocationID[0], 0, 0), 0);
+    $elif DIM == 1:
+      sum += texelFetch(in_tex, ivec3(0, gl_GlobalInvocationID[0], 0), 0);
+    $elif DIM == 2:
+      sum += texelFetch(in_tex, ivec3(0, 0, gl_GlobalInvocationID[0]), 0);
+  }
+
+  // Zero for i >= 0; makes the result depend on i to discourage optimizing the loop away.
+  vec4 zero = vec4(i>>31);
+
+  out_buf[0] = sum + zero;
+}
diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.yaml b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.yaml
new file mode 100644
index 00000000000..99002aff298
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.yaml
@@ -0,0 +1,14 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+tex_cacheline_size:
+  parameter_names_with_default_values:
+    DTYPE: float
+  generate_variant_forall:
+    DIM:
+      - RANGE: [0, 2]
+  shader_variants:
+    - NAME: tex_cacheline_size
diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp
index c33e8a011d9..42631702f5e 100644
--- a/backends/vulkan/tools/gpuinfo/src/app.cpp
+++ b/backends/vulkan/tools/gpuinfo/src/app.cpp
@@ -291,12 +291,122 @@ class App {
     if (stride >= MAX_STRIDE) {
       std::cout << "Unable to conclude a top level buffer cacheline size."
<< std::endl; - cacheline_size = MAX_STRIDE; + cacheline_size = MAX_STRIDE * sizeof(float); } std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl; } + // Textures are drastically different from buffers in terms of data layout. + // While buffers are a contiguous range of memory, textures are opaque objects + // defined by the vendor and it is possible that nearby points of data are not + // neighboring in memory. Likewise, data points are accessed in + // multi-dimensional patches instead of simple lines. This makes the stride + // method for figuring out the cache line size not applicable. To go around + // this, this experiment runs an increasing amount of threads accessing + // different datapoints in the texture and measures latency. If the cache line + // is big enough for all threads to access it at the same time, latency will + // be low. When there are more threads than what a single cache line can + // handle, a second line must be fetched, increasing latency in a measurable + // way. With this, we can find the cache line size of all three dimensions. + void tex_cacheline_size() { + if (!_enabled("tex_cacheline_size")) { + std::cout << "Skipped Texture Cacheline Size" << std::endl; + return; + } + + const double COMPENSATE = _get_config("tex_cacheline_size", "compensate"); + const double THRESHOLD = _get_config("tex_cacheline_size", "threshold"); + + uint32_t concur_nthread_by_dim[3]; + + for (int dim = 0; dim < 3; ++dim) { + std::cout << std::endl; + std::cout << "------ Texture Cacheline Size (dim = " << dim << ") ------" + << std::endl; + + uint32_t NITER; + + const uint32_t IMG_OTHER_EDGE = dim == 0 ? max_tex_width_ + : dim == 1 ? 
max_tex_height_ + : max_tex_depth_; + + const uint32_t MAX_NTHREAD = std::min(nthread_logic_, IMG_OTHER_EDGE); + + uint32_t& concur_nthread = concur_nthread_by_dim[dim]; + + auto bench = [&](uint32_t nthread) { + std::vector sizes_whd = { + max_tex_width_, max_tex_height_, max_tex_depth_}; + + auto sizes_nchw = _whd_to_nchw(sizes_whd); + + vTensor in_tensor = + api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); + + // Single vec4 + StorageBuffer out_buf(context(), vkapi::kFloat, 4); + + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "tex_cacheline_size_" + std::to_string(dim); + + auto time = benchmark_on_gpu(shader_name, 100, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {nthread, 1, 1}, + {nthread, 1, 1}, + {SV(NITER)}, + VK_NULL_HANDLE, + 0, + in_tensor.image(), + out_buf.buffer()); + }); + return time; + }; + + ensure_min_niter(1000, NITER, [&]() { return bench(1); }); + + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + uint32_t nthread = 1; + for (; nthread <= MAX_NTHREAD; ++nthread) { + double time = bench(nthread); + std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time + << std::endl; + + if (dj.push(time)) { + concur_nthread = nthread - 1; + std::cout << "Can concurrently access " << concur_nthread + << "px with " << "minimal cost along dim=" << dim + << std::endl; + break; + } + } + if (nthread >= MAX_NTHREAD) { + std::cout + << "Unable to conclude a top level texture cacheline size for dim " + << dim << std::endl; + } else { + concur_nthread_by_dim[dim] = concur_nthread; + } + } + + uint32_t TEXEL_SIZE = 4 * sizeof(float); + const uint32_t concur_nthread_x = concur_nthread_by_dim[0]; + const uint32_t concur_nthread_y = concur_nthread_by_dim[1]; + + uint32_t cacheline_size = TEXEL_SIZE * + std::max(concur_nthread_x, concur_nthread_y) / + std::min(concur_nthread_x, concur_nthread_y); + + std::cout << "TextureCachelineSize," << cacheline_size << std::endl; + + 
std::string cacheline_dim; + cacheline_dim = concur_nthread_x >= concur_nthread_y ? "X" : "Y"; + std::cout << "TextureCachelineDim," << cacheline_dim << std::endl; + } + private: void _bandwidth(std::string memtype, uint32_t range) { auto memtype_lower = memtype; @@ -689,6 +799,7 @@ int main(int argc, const char** argv) { app.shared_mem_bandwidth(); app.warp_size(); app.tex_bandwidth(); + app.tex_cacheline_size(); return 0; }