From 5c211eb814555a6c8f3d7c47d6a5082d33b4dd3f Mon Sep 17 00:00:00 2001
From: Esteban Padilla Cerdio
Date: Fri, 26 Jul 2024 11:53:44 -0700
Subject: [PATCH] Refactor and class split (#4432)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4432

Big classes are scary ☹️

This diff subdivides the tests into categories, places them as functions inside the gpuinfo namespace, instead of as part of the App class, and the App class is now only for persisting device information and configuration.

Differential Revision: D60290882
---
 backends/vulkan/tools/gpuinfo/include/app.h | 116 +++
 .../tools/gpuinfo/include/architecture.h | 287 +++++++
 .../vulkan/tools/gpuinfo/include/buffers.h | 197 +++++
 .../vulkan/tools/gpuinfo/include/textures.h | 226 +++++
 backends/vulkan/tools/gpuinfo/include/utils.h | 9 +
 backends/vulkan/tools/gpuinfo/src/app.cpp | 805 ------------------
 backends/vulkan/tools/gpuinfo/src/main.cpp | 40 +
 7 files changed, 875 insertions(+), 805 deletions(-)
 create mode 100644 backends/vulkan/tools/gpuinfo/include/app.h
 create mode 100644 backends/vulkan/tools/gpuinfo/include/architecture.h
 create mode 100644 backends/vulkan/tools/gpuinfo/include/buffers.h
 create mode 100644 backends/vulkan/tools/gpuinfo/include/textures.h
 delete mode 100644 backends/vulkan/tools/gpuinfo/src/app.cpp
 create mode 100644 backends/vulkan/tools/gpuinfo/src/main.cpp

diff --git a/backends/vulkan/tools/gpuinfo/include/app.h b/backends/vulkan/tools/gpuinfo/include/app.h
new file mode 100644
index 00000000000..21e3258280d
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/include/app.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + +#pragma once + +#include +#include +#include +#include + +#include "utils.h" + +namespace gpuinfo { + +class App { + private: + folly::dynamic config_; + + public: + size_t buf_cache_size; + uint32_t max_shared_mem_size; + uint32_t sm_count; + uint32_t nthread_logic; + uint32_t subgroup_size; + uint32_t max_tex_width; + uint32_t max_tex_height; + uint32_t max_tex_depth; + + App() { + { + context()->initialize_querypool(); + + std::cout << context()->adapter_ptr()->stringize() << std::endl + << std::endl; + + auto cl_device = get_cl_device(); + + sm_count = cl_device.getInfo(); + nthread_logic = cl_device.getInfo(); + buf_cache_size = cl_device.getInfo(); + max_shared_mem_size = cl_device.getInfo(); + max_tex_width = cl_device.getInfo(); + max_tex_height = cl_device.getInfo(); + max_tex_depth = cl_device.getInfo(); + + VkPhysicalDeviceSubgroupProperties subgroup_props{}; + VkPhysicalDeviceProperties2 props2{}; + + props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + props2.pNext = &subgroup_props; + subgroup_props.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; + vkGetPhysicalDeviceProperties2( + context()->adapter_ptr()->physical_handle(), &props2); + subgroup_size = subgroup_props.subgroupSize; + + std::cout << std::endl; + std::cout << "SM count," << sm_count << std::endl; + std::cout << "Logic Thread Count," << nthread_logic << std::endl; + std::cout << "Cache Size," << buf_cache_size << std::endl; + std::cout << "Shared Memory Size," << max_shared_mem_size << std::endl; + std::cout << "SubGroup Size," << subgroup_size << std::endl; + std::cout << "MaxTexWidth," << max_tex_width << std::endl; + std::cout << "MaxTexHeight," << max_tex_height << std::endl; + std::cout << "MaxTexDepth," << max_tex_depth << std::endl; + } + } + + float get_config(const std::string& test, const std::string& key) { + if (config_[test].empty()) { + throw std::runtime_error("Missing config for " + test); + } + + if (!config_[test][key].isNumber()) { + throw std::runtime_error( + "Config for " + test + "." + key + " is not a number"); + } + + float value; + if (config_[test][key].isDouble()) { + value = config_[test][key].getDouble(); + } else { + value = config_[test][key].getInt(); + } + + std::cout << "Read value for " << test << "." << key << " = " << value + << std::endl; + return value; + } + + bool enabled(const std::string& test) { + if (config_.empty() || config_[test].empty() || + !config_[test]["enabled"].isBool()) { + return true; + } + return config_[test]["enabled"].getBool(); + } + + void load_config(std::string file_path) { + std::ifstream file(file_path); + std::stringstream buffer; + buffer << file.rdbuf(); + const std::string json_str = buffer.str(); + if (json_str.empty()) { + throw std::runtime_error( + "Failed to read config file from " + file_path + "."); + } + config_ = folly::parseJson(json_str); + } +}; +} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h new file mode 100644 index 00000000000..63f0786a805 --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/include/architecture.h @@ -0,0 +1,287 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include "app.h" +#include "stats.h" +#include "utils.h" + +using namespace vkapi; + +namespace gpuinfo { + +void reg_count(App& app) { + if (!app.enabled("reg_count")) { + std::cout << "Skipped Register Count" << std::endl; + return; + } + + std::cout << std::endl; + std::cout << "------ Register Count ------" << std::endl; + const uint32_t NREG_MIN = 1; + const uint32_t NREG_MAX = 512; + const uint32_t NREG_STEP = 1; + + const double COMPENSATE = app.get_config("reg_count", "compensate"); + const double THRESHOLD = app.get_config("reg_count", "threshold"); + + const uint32_t NGRP_MIN = 1; + const uint32_t NGRP_MAX = 64; + const uint32_t NGRP_STEP = 1; + + uint32_t NITER; + + auto bench = [&](uint32_t ngrp, uint32_t nreg) { + StorageBuffer buffer(context(), vkapi::kFloat, 1); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "reg_count_" + std::to_string(nreg); + + auto time = benchmark_on_gpu(shader_name, 100, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {1, ngrp, 1}, + {1, 1, 1}, + {SV(NITER)}, + VK_NULL_HANDLE, + 0, + buffer.buffer()); + }); + return time; + }; + + std::cout << "Calculating NITER..." << std::endl; + ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); }); + std::cout << "NITER," << NITER << std::endl; + + uint32_t nreg_max; + + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + uint32_t nreg = NREG_MIN; + for (; nreg <= NREG_MAX; nreg += NREG_STEP) { + double time = bench(1, nreg); + std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << std::endl; + if (dj.push(time)) { + nreg -= NREG_STEP; + nreg_max = nreg; + break; + } + } + if (nreg >= NREG_MAX) { + std::cout << "Unable to conclude a maximal register count" << std::endl; + nreg_max = NREG_STEP; + } else { + std::cout << nreg_max << " registers are available at most" << std::endl; + } + + auto find_ngrp_by_nreg = [&](const uint32_t nreg) { + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) { + auto time = bench(ngrp, nreg); + std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp + << ", time=" << time << " us" << std::endl; + + if (dj.push(time)) { + ngrp -= NGRP_STEP; + std::cout << "Using " << nreg << " registers can have " << ngrp + << " concurrent single-thread workgroups" << std::endl; + return ngrp; + } + } + std::cout + << "Unable to conclude a maximum number of concurrent single-thread workgroups when " + << nreg << " registers are occupied" << std::endl; + return (uint32_t)1; + }; + + uint32_t ngrp_full, ngrp_half; + ngrp_full = find_ngrp_by_nreg(nreg_max); + ngrp_half = find_ngrp_by_nreg(nreg_max / 2); + + std::string reg_ty; + + if (ngrp_full * 1.5 < ngrp_half) { + std::cout << "All physical threads in an sm share " << nreg_max + << " registers" << std::endl; + reg_ty = "Pooled"; + + } else { + std::cout << "Each physical thread has " << nreg_max << " registers" + << std::endl; + reg_ty = "Dedicated"; + } + + std::cout << std::endl << std::endl; + std::cout << "NITER," << NITER << std::endl; + std::cout << "Max registers," << nreg_max << std::endl; + std::cout << "Concurrent full single thread workgroups," << ngrp_full + << std::endl; + std::cout << "Concurrent half single thread workgroups," << ngrp_half + << std::endl; + std::cout << "Register type," << reg_ty << std::endl; +} + +// Warp size is a difficult metric to obtain because the hardware limitations +// do not always coincide with the way the SM divides the 
workload. For +// instance, the hardware can have a warp size of 64 threads, but an SM might +// be able to simulate concurrency of 128 threads with a single scheduler. + +// Because of this, it is important to measure the warp size different ways, +// that can evidence both the physical limitations of the hardware, and the +// actual behavior of the driver. + +// Additionally,the SM can behave in two different ways when the assigned +// workload is smaller than the warp size. + +// In Case 1, like ARM Mali, the SM can assign dummy workloads to fill empty +// threads and maintain a uniform workload. + +// In Case 2, like in Adreno, the driver might decide to pack multiple works +// together and dispatch them at once. +void warp_size(App& app, bool verbose = false) { + if (!app.enabled("warp_size")) { + std::cout << "Skipped Warp Size" << std::endl; + return; + } + + std::cout << "\n------ Warp Size ------" << std::endl; + + // Method A: Stress test with a kernel that uses complex ALU operations like + // integer division to avoid latency hiding. Increase the number of threads + // until a jump in latency is detected. + + // This timing-based method helps us identify physical warp sizes. It also + // helps with Case 2, when threads of multiple warps are managed by the same + // scheduler at the same time. + const double COMPENSATE = app.get_config("warp_size", "compensate"); + const double THRESHOLD = app.get_config("warp_size", "threshold"); + + uint32_t NITER; + + auto bench = [&](uint32_t nthread) { + StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "warp_size_physical"; + + auto time = benchmark_on_gpu(shader_name, 10, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + // Large number of work groups selected to potentially saturate all + // ALUs and thus have a better baseline for comparison. + {nthread, 1024, 1}, + {nthread, 1, 1}, + {SV(NITER)}, + VK_NULL_HANDLE, + 0, + out_buf.buffer()); + }); + + return time; + }; + + ensure_min_niter(1000, NITER, [&]() { return bench(1); }); + + uint32_t warp_size = app.subgroup_size; + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + + // We increase the number of threads until we hit a jump in the data. + uint32_t nthread = 1; + for (; nthread <= app.nthread_logic; ++nthread) { + double time = bench(nthread); + std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)" + << std::endl; + if (dj.push(time)) { + warp_size = nthread - 1; + break; + } + } + if (nthread >= app.nthread_logic) { + std::cout + << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size" + << std::endl; + } + + // Method B: Let all the threads in a warp race and atomically fetch-add + // a counter, then store the counter values to the output buffer in the + // scheduling order of these threads. If all the order numbers follow an + // ascending order, then the threads are likely executing within a warp. + // Threads in different warps are not managed by the same scheduler, so they + // would race for a same ID out of order, unaware of each other. + + // This method evidences the actual driver behavior when running + // concurrency, regardless of the physical limitations of the hardware. + + // Likewise, this method helps us identify warp sizes when the SM + // sub-divides its ALUs into independent groups, like the three execution + // engines in a Mali G76 core. 
It helps warp-probing in Case 1 because it + // doesn't depend on kernel timing, so the extra wait time doesn't lead to + // inaccuracy. + auto bench_sm = [&](uint32_t nthread) { + StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "warp_size_scheduler"; + + benchmark_on_gpu(shader_name, 1, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {nthread, 1, 1}, + {nthread, 1, 1}, + {}, + VK_NULL_HANDLE, + 0, + out_buf.buffer()); + }); + + std::vector data(app.nthread_logic); + copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes()); + + if (verbose) { + std::stringstream ss; + for (auto j = 0; j < nthread; ++j) { + ss << data[j] << " "; + } + std::cout << ss.str() << std::endl; + } + + // Check until which point is the data in ascending order. + int32_t last = -1; + int32_t j = 0; + for (; j < nthread; ++j) { + if (last >= data[j]) { + break; + } + last = data[j]; + } + + return j; + }; + + // Test increasing sizes until the data is no longer in ascending order. + uint32_t warp_size_scheduler = warp_size; + int i = 1; + for (; i <= app.nthread_logic; ++i) { + uint32_t nascend = bench_sm(i); + if (nascend != i) { + warp_size_scheduler = nascend; + break; + } + } + if (i > app.nthread_logic) { + std::cout << "Unable to conclude an SM Warp Size." << std::endl; + } + + std::cout << "PhysicalWarpSize," << warp_size << std::endl; + std::cout << "SMWarpSize," << warp_size_scheduler << std::endl; +} +}; // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h new file mode 100644 index 00000000000..7f108a3e13d --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/include/buffers.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include "app.h" +#include "stats.h" +#include "utils.h" + +using namespace vkapi; + +namespace gpuinfo { + +void buf_cacheline_size(App& app) { + if (!app.enabled("buf_cacheline_size")) { + std::cout << "Skipped Buffer Cacheline Size" << std::endl; + return; + } + + std::cout << std::endl; + std::cout << "------ Buffer Cacheline Size ------" << std::endl; + + const double COMPENSATE = app.get_config("buf_cacheline_size", "compensate"); + const double THRESHOLD = app.get_config("buf_cacheline_size", "threshold"); + + const uint32_t PITCH = app.buf_cache_size / app.nthread_logic; + const uint32_t BUF_SIZE = app.buf_cache_size; + const uint32_t MAX_STRIDE = PITCH; + + uint32_t NITER; + + auto bench = [&](int stride) { + StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); + StorageBuffer out_buf(context(), vkapi::kFloat, 1); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "buf_cacheline_size"; + + auto time = benchmark_on_gpu(shader_name, 100, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {app.nthread_logic, 1, 1}, + {app.nthread_logic, 1, 1}, + {SV(NITER), SV(stride), SV(PITCH)}, + VK_NULL_HANDLE, + 0, + in_buf.buffer(), + out_buf.buffer()); + }); + return time; + }; + + ensure_min_niter(1000, NITER, [&]() { return bench(1); }); + + uint32_t cacheline_size; + + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + uint32_t stride = 1; + for (; stride <= MAX_STRIDE; ++stride) { + double time = bench(stride); + std::cout << "Testing stride=\t" << stride << "\t, time=\t" << time + << std::endl; + + if (dj.push(time)) { + cacheline_size = stride * sizeof(float); + break; + } + } + if (stride >= MAX_STRIDE) { + std::cout << "Unable to conclude a top level buffer cacheline size." + << std::endl; + cacheline_size = MAX_STRIDE * sizeof(float); + } + + std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl; +} + +void _bandwidth(App& app, std::string memtype, uint32_t range) { + auto memtype_lower = memtype; + std::transform( + memtype_lower.begin(), + memtype_lower.end(), + memtype_lower.begin(), + [](unsigned char c) { return std::tolower(c); }); + + auto test_name = memtype_lower + "_bandwidth"; + + // Cache lines flushed + const uint32_t NFLUSH = app.get_config(test_name, "nflush"); + // Number of loop unrolls. Changing this value requires an equal change in + // buf_bandwidth.yaml + const uint32_t NUNROLL = app.get_config(test_name, "nunroll"); + // Number of iterations. Increasing this value reduces noise in exchange for + // higher latency. + const uint32_t NITER = app.get_config(test_name, "niter"); + // Vector dimensions (vec4) + const uint32_t VEC_WIDTH = 4; + const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); + // Number of vectors that fit in the selected memory space + const uint32_t NVEC = range / VEC_SIZE; + // Number of memory reads per thread + const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; + // Number of threads needed to read al l vectors + // The thread count doesn't divide by thread workload in shared memory + // because of the limited memory size. + const uint32_t NTHREAD = memtype == "Shared" ? 
NVEC : NVEC / NREAD_PER_THREAD; + // Occupy all threads + const uint32_t local_x = app.nthread_logic; + // Ensure that global is a multiple of local, and distribute across all SMs + const uint32_t global_x = + (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH; + + auto bench = [&](uint32_t access_size) { + // Number of vectors that fit in this iteration + const uint32_t nvec_access = access_size / VEC_SIZE; + + StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); + StorageBuffer out_buf( + context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "buf_bandwidth_" + memtype_lower; + + auto time = benchmark_on_gpu(shader_name, 10, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {global_x, 1, 1}, + {local_x, 1, 1}, + {SV(NITER), SV(nvec_access), SV(local_x)}, + VK_NULL_HANDLE, + 0, + in_buf.buffer(), + out_buf.buffer()); + }); + + const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; + auto gbps = SIZE_TRANS * 1e-3 / time; + std::cout << memtype << " bandwidth accessing \t" << access_size + << "\tB unique data is \t" << gbps << " \tgbps (\t" << time + << "\tus)" << std::endl; + return gbps; + }; + + double max_bandwidth = 0; + double min_bandwidth = DBL_MAX; + for (uint32_t access_size = VEC_SIZE; access_size < range; access_size *= 2) { + double gbps = bench(access_size); + max_bandwidth = std::max(gbps, max_bandwidth); + min_bandwidth = std::min(gbps, min_bandwidth); + } + + std::cout << "Max" << memtype << "Bandwidth (GB/s)," << max_bandwidth + << std::endl; + std::cout << "Min" << memtype << "Bandwidth (GB/s)," << min_bandwidth + << std::endl; +} + +void buf_bandwidth(App& app) { + if (!app.enabled("buffer_bandwidth")) { + std::cout << "Skipped Memory Bandwidth" << std::endl; + return; + } + + std::cout << "\n------ Memory Bandwidth ------" << std::endl; + // Maximum memory space read - 128MB + // For regular devices, bandwidth plateaus at less memory than this, so more + // is not needed. + const uint32_t RANGE = app.get_config("buffer_bandwidth", "range"); + _bandwidth(app, "Buffer", RANGE); +} + +void ubo_bandwidth(App& app) { + if (!app.enabled("ubo_bandwidth")) { + std::cout << "Skipped UBO Bandwidth" << std::endl; + return; + } + + std::cout << "\n------ UBO Bandwidth ------" << std::endl; + const uint32_t RANGE = app.get_config("ubo_bandwidth", "range"); + _bandwidth(app, "UBO", RANGE); +} + +void shared_mem_bandwidth(App& app) { + if (!app.enabled("shared_mem_bandwidth")) { + std::cout << "Skipped Shared Memory Bandwidth" << std::endl; + return; + } + + std::cout << "\n------ Shared Bandwidth ------" << std::endl; + const uint32_t RANGE = app.max_shared_mem_size; + _bandwidth(app, "Shared", RANGE); +} +} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h new file mode 100644 index 00000000000..f8874306c0a --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/include/textures.h @@ -0,0 +1,226 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include "app.h" +#include "stats.h" +#include "utils.h" + +namespace gpuinfo { + +// To improve render quality, the textures are usually filtered and +// interpolated to smooth out color steps in the original image, which +// requires multiple access to a 3D area nearby the point of interpolation. +// Textures are therefore drastically different from buffers, they need to +// provide rapid access to data points in a 3D patch; and each data point has +// four elements, as it's originally been designed for, corresponding to the +// components in the RGB color space with an extra alpha channel. Also, unlike +// a buffer which refers to a contiguous range of memory, a texture is an +// opaque object, defined by GPU vendors and the users usually have no +// knowledge about its actual data layout, so it is possible that the memory +// *does not* layout linearly. These characteristics prevent us from designing +// the addressing and vectorization of data. The only general optimization we +// can do is to align the amount of data accessed at a time to the relatively +// small top-level cache system, i.e., the L1 texture cache. +// +// In the experiment, we assume L1 cacheline size is small enough that +// several threads reading float4s can exceed it. In both direction, along the +// width and the height, each logically concurrent thread in an SM reads a +// float4. Such memory access should be satisfied by a single cache fetch, but +// if the cache is not large enough to contain all requested data, multiple +// fetches will significantly increase access latency. +void tex_cacheline_size(App& app) { + if (!app.enabled("tex_cacheline_size")) { + std::cout << "Skipped Texture Cacheline Size" << std::endl; + return; + } + + const double COMPENSATE = app.get_config("tex_cacheline_size", "compensate"); + const double THRESHOLD = app.get_config("tex_cacheline_size", "threshold"); + + uint32_t concur_nthread_by_dim[3]; + + for (int dim = 0; dim < 3; ++dim) { + std::cout << std::endl; + std::cout << "------ Texture Cacheline Size (dim = " << dim << ") ------" + << std::endl; + + uint32_t NITER; + + const uint32_t IMG_OTHER_EDGE = dim == 0 ? app.max_tex_width + : dim == 1 ? 
app.max_tex_height + : app.max_tex_depth; + + const uint32_t MAX_NTHREAD = std::min(app.nthread_logic, IMG_OTHER_EDGE); + + uint32_t& concur_nthread = concur_nthread_by_dim[dim]; + + auto bench = [&](uint32_t nthread) { + std::vector sizes_whd = { + app.max_tex_width, app.max_tex_height, app.max_tex_depth}; + + auto sizes_nchw = whd_to_nchw(sizes_whd); + + vTensor in_tensor = + api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); + + // Single vec4 + StorageBuffer out_buf(context(), vkapi::kFloat, 4); + + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "tex_cacheline_size_" + std::to_string(dim); + + auto time = benchmark_on_gpu(shader_name, 100, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {nthread, 1, 1}, + {nthread, 1, 1}, + {SV(NITER)}, + VK_NULL_HANDLE, + 0, + in_tensor.image(), + out_buf.buffer()); + }); + return time; + }; + + ensure_min_niter(1000, NITER, [&]() { return bench(1); }); + + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + uint32_t nthread = 1; + for (; nthread <= MAX_NTHREAD; ++nthread) { + double time = bench(nthread); + std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time + << std::endl; + + if (dj.push(time)) { + concur_nthread = nthread - 1; + std::cout << "Can concurrently access " << concur_nthread << "px with " + << "minimal cost along dim=" << dim << std::endl; + break; + } + } + if (nthread >= MAX_NTHREAD) { + std::cout << "Unable to conclude a top level texture cacheline size." + << std::endl; + } else { + concur_nthread_by_dim[dim] = concur_nthread; + } + } + + uint32_t TEXEL_SIZE = 4 * sizeof(float); + const uint32_t concur_nthread_x = concur_nthread_by_dim[0]; + const uint32_t concur_nthread_y = concur_nthread_by_dim[1]; + + uint32_t cacheline_size = TEXEL_SIZE * + std::max(concur_nthread_x, concur_nthread_y) / + std::min(concur_nthread_x, concur_nthread_y); + + std::cout << "TextureCachelineSize," << cacheline_size << std::endl; + + std::string cacheline_dim; + cacheline_dim = concur_nthread_x >= concur_nthread_y ? "X" : "Y"; + std::cout << "TextureCachelineDim," << cacheline_dim << std::endl; +} + +void tex_bandwidth(App& app) { + if (!app.enabled("tex_bandwidth")) { + std::cout << "Skipped Texture Bandwidth" << std::endl; + return; + } + + for (int dim = 0; dim < 3; dim++) { + std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------" + << std::endl; + const uint32_t MAX_SIZE = dim == 0 ? app.max_tex_width + : dim == 1 ? app.max_tex_height + : app.max_tex_depth; + + // rgba, float + const uint32_t VEC_WIDTH = 4; + const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); + const uint32_t NVEC = MAX_SIZE; + + const uint32_t RANGE = NVEC * VEC_SIZE; + + // Cache lines flushed + const uint32_t NFLUSH = app.get_config("tex_bandwidth", "nflush"); + // Number of loop unrolls. Changing this value requires an equal change in + // tex_bandwidth.yaml + const uint32_t NUNROLL = app.get_config("tex_bandwidth", "nunroll"); + // Number of iterations. Increasing this value reduces noise in exchange + // for higher latency. 
+ const uint32_t NITER = app.get_config("tex_bandwidth", "niter"); + // Number of memory reads per thread + const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; + // Number of threads needed to read all texells + const uint32_t NTHREAD = NVEC; + // Occupy all threads + const uint32_t local_x = app.nthread_logic; + // Ensure that global is a multiple of local, and distribute across all + // SMs + const uint32_t global_x = + (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH; + + auto shader_name = "tex_bandwidth_" + std::to_string(dim); + + std::vector sizes_whd = {MAX_SIZE, 1, 1}; + if (dim == 1) { + sizes_whd = {1, MAX_SIZE, 1}; + } else if (dim == 2) { + sizes_whd = {1, 1, MAX_SIZE}; + } + auto sizes_nchw = whd_to_nchw(sizes_whd); + + vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); + + auto bench = [&](uint32_t access_size, uint32_t dim) { + // Number of texels that fit in this iteration + const uint32_t ntexel_access = access_size / VEC_SIZE; + + StorageBuffer out_buf( + context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto time = benchmark_on_gpu(shader_name, 10, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {global_x, 1, 1}, + {local_x, 1, 1}, + {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)}, + VK_NULL_HANDLE, + 0, + in_tensor.image(), + out_buf.buffer()); + }); + + const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; + double gbps = SIZE_TRANS * 1e-3 / time; + std::cout << "Texture bandwidth accessing \t" << access_size + << "\tB unique data is \t" << gbps << " \tgbps (\t" << time + << "\tus)" << std::endl; + return gbps; + }; + + double max_bandwidth = 0; + double min_bandwidth = DBL_MAX; + for (uint32_t access_size = VEC_SIZE; access_size < RANGE; + access_size *= 2) { + double gbps = bench(access_size, dim); + max_bandwidth = std::max(gbps, max_bandwidth); + min_bandwidth = std::min(gbps, min_bandwidth); + } + + std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth + << std::endl; + std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth + << std::endl; + } +} +} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/utils.h b/backends/vulkan/tools/gpuinfo/include/utils.h index 231fb32c5a9..887cb443ef4 100644 --- a/backends/vulkan/tools/gpuinfo/include/utils.h +++ b/backends/vulkan/tools/gpuinfo/include/utils.h @@ -54,6 +54,15 @@ void ensure_min_niter( } } +std::vector whd_to_nchw(std::vector sizes) { + const int64_t W = sizes[0]; + const int64_t H = sizes[1]; + const int64_t D = sizes[2]; + + // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} + return {1, D * 4, H, W}; +} + cl_platform_id get_cl_platform_id() { cl_uint nplatform_id; clGetPlatformIDs(0, nullptr, &nplatform_id); diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp deleted file mode 100644 index 42631702f5e..00000000000 --- a/backends/vulkan/tools/gpuinfo/src/app.cpp +++ /dev/null @@ -1,805 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include -#include -#include -#include - -#include "stats.h" -#include "utils.h" - -using namespace vkapi; - -class App { - private: - size_t buf_cache_size_; - uint32_t max_shared_mem_size_; - uint32_t sm_count_; - uint32_t nthread_logic_; - uint32_t subgroup_size_; - uint32_t max_tex_width_; - uint32_t max_tex_height_; - uint32_t max_tex_depth_; - folly::dynamic config_; - - std::vector _whd_to_nchw(std::vector sizes) { - const int64_t W = sizes[0]; - const int64_t H = sizes[1]; - const int64_t D = sizes[2]; - - // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} - return {1, D * 4, H, W}; - } - - float _get_config(const std::string& test, const std::string& key) { - if (config_[test].empty()) { - throw std::runtime_error("Missing config for " + test); - } - - if (!config_[test][key].isNumber()) { - throw std::runtime_error( - "Config for " + test + "." + key + " is not a number"); - } - - float value; - if (config_[test][key].isDouble()) { - value = config_[test][key].getDouble(); - } else { - value = config_[test][key].getInt(); - } - - std::cout << "Read value for " << test << "." << key << " = " << value - << std::endl; - return value; - } - - bool _enabled(const std::string& test) { - if (config_.empty() || config_[test].empty() || - !config_[test]["enabled"].isBool()) { - return true; - } - return config_[test]["enabled"].getBool(); - } - - public: - App() { - context()->initialize_querypool(); - - std::cout << context()->adapter_ptr()->stringize() << std::endl - << std::endl; - - auto cl_device = get_cl_device(); - - sm_count_ = cl_device.getInfo(); - nthread_logic_ = cl_device.getInfo(); - buf_cache_size_ = cl_device.getInfo(); - max_shared_mem_size_ = cl_device.getInfo(); - max_tex_width_ = cl_device.getInfo(); - max_tex_height_ = cl_device.getInfo(); - max_tex_depth_ = cl_device.getInfo(); - - VkPhysicalDeviceSubgroupProperties subgroup_props{}; - VkPhysicalDeviceProperties2 props2{}; - - props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; - props2.pNext = &subgroup_props; - subgroup_props.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; - vkGetPhysicalDeviceProperties2( - context()->adapter_ptr()->physical_handle(), &props2); - subgroup_size_ = subgroup_props.subgroupSize; - - std::cout << std::endl; - std::cout << "SM count," << sm_count_ << std::endl; - std::cout << "Logic Thread Count," << nthread_logic_ << std::endl; - std::cout << "Cache Size," << buf_cache_size_ << std::endl; - std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl; - std::cout << "SubGroup Size," << subgroup_size_ << std::endl; - std::cout << "MaxTexWidth," << max_tex_width_ << std::endl; - std::cout << "MaxTexHeight," << max_tex_height_ << std::endl; - std::cout << "MaxTexDepth," << max_tex_depth_ << std::endl; - } - - void load_config(std::string file_path) { - std::ifstream file(file_path); - std::stringstream buffer; - buffer << file.rdbuf(); - const std::string json_str = buffer.str(); - if (json_str.empty()) { - throw std::runtime_error( - "Failed to read config file from " + file_path + "."); - } - config_ = folly::parseJson(json_str); - } - - void reg_count() { - if (!_enabled("reg_count")) { - std::cout << "Skipped Register Count" << std::endl; - return; - } - - std::cout << std::endl; - std::cout << "------ Register Count ------" << std::endl; - const uint32_t NREG_MIN = 1; - const uint32_t NREG_MAX = 512; - const uint32_t NREG_STEP = 1; - - const double COMPENSATE = _get_config("reg_count", "compensate"); - const double THRESHOLD = 
_get_config("reg_count", "threshold"); - - const uint32_t NGRP_MIN = 1; - const uint32_t NGRP_MAX = 64; - const uint32_t NGRP_STEP = 1; - - uint32_t NITER; - - auto bench = [&](uint32_t ngrp, uint32_t nreg) { - StorageBuffer buffer(context(), vkapi::kFloat, 1); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "reg_count_" + std::to_string(nreg); - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {1, ngrp, 1}, - {1, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - buffer.buffer()); - }); - return time; - }; - - std::cout << "Calculating NITER..." << std::endl; - ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); }); - std::cout << "NITER," << NITER << std::endl; - - uint32_t nreg_max; - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t nreg = NREG_MIN; - for (; nreg <= NREG_MAX; nreg += NREG_STEP) { - double time = bench(1, nreg); - std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time - << std::endl; - if (dj.push(time)) { - nreg -= NREG_STEP; - nreg_max = nreg; - break; - } - } - if (nreg >= NREG_MAX) { - std::cout << "Unable to conclude a maximal register count" << std::endl; - nreg_max = NREG_STEP; - } else { - std::cout << nreg_max << " registers are available at most" << std::endl; - } - - auto find_ngrp_by_nreg = [&](const uint32_t nreg) { - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) { - auto time = bench(ngrp, nreg); - std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp - << ", time=" << time << " us" << std::endl; - - if (dj.push(time)) { - ngrp -= NGRP_STEP; - std::cout << "Using " << nreg << " registers can have " << ngrp - << " concurrent single-thread workgroups" << std::endl; - return ngrp; - } - } - std::cout - << "Unable to conclude a maximum number of concurrent single-thread workgroups when " - << nreg << " registers are occupied" << std::endl; - return (uint32_t)1; - }; - - uint32_t ngrp_full, ngrp_half; - ngrp_full = find_ngrp_by_nreg(nreg_max); - ngrp_half = find_ngrp_by_nreg(nreg_max / 2); - - std::string reg_ty; - - if (ngrp_full * 1.5 < ngrp_half) { - std::cout << "All physical threads in an sm share " << nreg_max - << " registers" << std::endl; - reg_ty = "Pooled"; - - } else { - std::cout << "Each physical thread has " << nreg_max << " registers" - << std::endl; - reg_ty = "Dedicated"; - } - - std::cout << std::endl << std::endl; - std::cout << "NITER," << NITER << std::endl; - std::cout << "Max registers," << nreg_max << std::endl; - std::cout << "Concurrent full single thread workgroups," << ngrp_full - << std::endl; - std::cout << "Concurrent half single thread workgroups," << ngrp_half - << std::endl; - std::cout << "Register type," << reg_ty << std::endl; - } - - void buf_cacheline_size() { - if (!_enabled("buf_cacheline_size")) { - std::cout << "Skipped Buffer Cacheline Size" << std::endl; - return; - } - - std::cout << std::endl; - std::cout << "------ Buffer Cacheline Size ------" << std::endl; - - const double COMPENSATE = _get_config("buf_cacheline_size", "compensate"); - const double THRESHOLD = _get_config("buf_cacheline_size", "threshold"); - - const uint32_t PITCH = buf_cache_size_ / nthread_logic_; - const uint32_t BUF_SIZE = buf_cache_size_; - const uint32_t MAX_STRIDE = PITCH; - - uint32_t NITER; - - auto bench = [&](int stride) { - StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); - StorageBuffer out_buf(context(), 
vkapi::kFloat, 1); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "buf_cacheline_size"; - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread_logic_, 1, 1}, - {nthread_logic_, 1, 1}, - {SV(NITER), SV(stride), SV(PITCH)}, - VK_NULL_HANDLE, - 0, - in_buf.buffer(), - out_buf.buffer()); - }); - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - uint32_t cacheline_size; - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t stride = 1; - for (; stride <= MAX_STRIDE; ++stride) { - double time = bench(stride); - std::cout << "Testing stride=\t" << stride << "\t, time=\t" << time - << std::endl; - - if (dj.push(time)) { - cacheline_size = stride * sizeof(float); - break; - } - } - if (stride >= MAX_STRIDE) { - std::cout << "Unable to conclude a top level buffer cacheline size." - << std::endl; - cacheline_size = MAX_STRIDE * sizeof(float); - } - - std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl; - } - - // Textures are drastically different from buffers in terms of data layout. - // While buffers are a contiguous range of memory, textures are opaque objects - // defined by the vendor and it is possible that nearby points of data are not - // neighboring in memory. Likewise, data points are accessed in - // multi-dimensional patches instead of simple lines. This makes the stride - // method for figuring out the cache line size not applicable. To go around - // this, this experiment runs an increasing amount of threads accessing - // different datapoints in the texture and measures latency. If the cache line - // is big enough for all threads to access it at the same time, latency will - // be low. When there are more threads than what a single cache line can - // handle, a second line must be fetched, increasing latency in a measurable - // way. With this, we can find the cache line size of all three dimensions. - void tex_cacheline_size() { - if (!_enabled("tex_cacheline_size")) { - std::cout << "Skipped Texture Cacheline Size" << std::endl; - return; - } - - const double COMPENSATE = _get_config("tex_cacheline_size", "compensate"); - const double THRESHOLD = _get_config("tex_cacheline_size", "threshold"); - - uint32_t concur_nthread_by_dim[3]; - - for (int dim = 0; dim < 3; ++dim) { - std::cout << std::endl; - std::cout << "------ Texture Cacheline Size (dim = " << dim << ") ------" - << std::endl; - - uint32_t NITER; - - const uint32_t IMG_OTHER_EDGE = dim == 0 ? max_tex_width_ - : dim == 1 ? 
max_tex_height_ - : max_tex_depth_; - - const uint32_t MAX_NTHREAD = std::min(nthread_logic_, IMG_OTHER_EDGE); - - uint32_t& concur_nthread = concur_nthread_by_dim[dim]; - - auto bench = [&](uint32_t nthread) { - std::vector sizes_whd = { - max_tex_width_, max_tex_height_, max_tex_depth_}; - - auto sizes_nchw = _whd_to_nchw(sizes_whd); - - vTensor in_tensor = - api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - - // Single vec4 - StorageBuffer out_buf(context(), vkapi::kFloat, 4); - - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "tex_cacheline_size_" + std::to_string(dim); - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread, 1, 1}, - {nthread, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - in_tensor.image(), - out_buf.buffer()); - }); - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t nthread = 1; - for (; nthread <= MAX_NTHREAD; ++nthread) { - double time = bench(nthread); - std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time - << std::endl; - - if (dj.push(time)) { - concur_nthread = nthread - 1; - std::cout << "Can concurrently access " << concur_nthread - << "px with " << "minimal cost along dim=" << dim - << std::endl; - break; - } - } - if (nthread >= MAX_NTHREAD) { - std::cout - << "Unable to conclude a top level texture cacheline size for dim " - << dim << std::endl; - } else { - concur_nthread_by_dim[dim] = concur_nthread; - } - } - - uint32_t TEXEL_SIZE = 4 * sizeof(float); - const uint32_t concur_nthread_x = concur_nthread_by_dim[0]; - const uint32_t concur_nthread_y = concur_nthread_by_dim[1]; - - uint32_t cacheline_size = TEXEL_SIZE * - std::max(concur_nthread_x, concur_nthread_y) / - std::min(concur_nthread_x, concur_nthread_y); - - std::cout << "TextureCachelineSize," << cacheline_size << std::endl; - - std::string cacheline_dim; - cacheline_dim = concur_nthread_x >= concur_nthread_y ? "X" : "Y"; - std::cout << "TextureCachelineDim," << cacheline_dim << std::endl; - } - - private: - void _bandwidth(std::string memtype, uint32_t range) { - auto memtype_lower = memtype; - std::transform( - memtype_lower.begin(), - memtype_lower.end(), - memtype_lower.begin(), - [](unsigned char c) { return std::tolower(c); }); - - auto test_name = memtype_lower + "_bandwidth"; - - // Cache lines flushed - const uint32_t NFLUSH = _get_config(test_name, "nflush"); - // Number of loop unrolls. Changing this value requires an equal change in - // buf_bandwidth.yaml - const uint32_t NUNROLL = _get_config(test_name, "nunroll"); - // Number of iterations. Increasing this value reduces noise in exchange for - // higher latency. - const uint32_t NITER = _get_config(test_name, "niter"); - // Vector dimensions (vec4) - const uint32_t VEC_WIDTH = 4; - const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); - // Number of vectors that fit in the selected memory space - const uint32_t NVEC = range / VEC_SIZE; - // Number of memory reads per thread - const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; - // Number of threads needed to read al l vectors - // The thread count doesn't divide by thread workload in shared memory - // because of the limited memory size. - const uint32_t NTHREAD = - memtype == "Shared" ? 
NVEC : NVEC / NREAD_PER_THREAD; - // Occupy all threads - const uint32_t local_x = nthread_logic_; - // Ensure that global is a multiple of local, and distribute across all SMs - const uint32_t global_x = - (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH; - - auto bench = [&](uint32_t access_size) { - // Number of vectors that fit in this iteration - const uint32_t nvec_access = access_size / VEC_SIZE; - - StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); - StorageBuffer out_buf( - context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "buf_bandwidth_" + memtype_lower; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {global_x, 1, 1}, - {local_x, 1, 1}, - {SV(NITER), SV(nvec_access), SV(local_x)}, - VK_NULL_HANDLE, - 0, - in_buf.buffer(), - out_buf.buffer()); - }); - - const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; - auto gbps = SIZE_TRANS * 1e-3 / time; - std::cout << memtype << " bandwidth accessing \t" << access_size - << "\tB unique data is \t" << gbps << " \tgbps (\t" << time - << "\tus)" << std::endl; - return gbps; - }; - - double max_bandwidth = 0; - double min_bandwidth = DBL_MAX; - for (uint32_t access_size = VEC_SIZE; access_size < range; - access_size *= 2) { - double gbps = bench(access_size); - max_bandwidth = std::max(gbps, max_bandwidth); - min_bandwidth = std::min(gbps, min_bandwidth); - } - - std::cout << "Max" << memtype << "Bandwidth (GB/s)," << max_bandwidth - << std::endl; - std::cout << "Min" << memtype << "Bandwidth (GB/s)," << min_bandwidth - << std::endl; - } - - public: - void buf_bandwidth() { - if (!_enabled("buffer_bandwidth")) { - std::cout << "Skipped Memory Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ Memory Bandwidth ------" << std::endl; - // Maximum memory space read - 128MB - // For regular devices, bandwidth plateaus at less memory than this, so more - // is not needed. - const uint32_t RANGE = _get_config("buffer_bandwidth", "range"); - _bandwidth("Buffer", RANGE); - } - - void ubo_bandwidth() { - if (!_enabled("ubo_bandwidth")) { - std::cout << "Skipped UBO Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ UBO Bandwidth ------" << std::endl; - const uint32_t RANGE = _get_config("ubo_bandwidth", "range"); - _bandwidth("UBO", RANGE); - } - - void shared_mem_bandwidth() { - if (!_enabled("shared_mem_bandwidth")) { - std::cout << "Skipped Shared Memory Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ Shared Bandwidth ------" << std::endl; - const uint32_t RANGE = max_shared_mem_size_; - _bandwidth("Shared", RANGE); - } - - void tex_bandwidth() { - if (!_enabled("tex_bandwidth")) { - std::cout << "Skipped Texture Bandwidth" << std::endl; - return; - } - - for (int dim = 0; dim < 3; dim++) { - std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------" - << std::endl; - const uint32_t MAX_SIZE = dim == 0 ? max_tex_width_ - : dim == 1 ? max_tex_height_ - : max_tex_depth_; - - // rgba, float - const uint32_t VEC_WIDTH = 4; - const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); - const uint32_t NVEC = MAX_SIZE; - - const uint32_t RANGE = NVEC * VEC_SIZE; - - // Cache lines flushed - const uint32_t NFLUSH = _get_config("tex_bandwidth", "nflush"); - // Number of loop unrolls. 
Changing this value requires an equal change in - // tex_bandwidth.yaml - const uint32_t NUNROLL = _get_config("tex_bandwidth", "nunroll"); - // Number of iterations. Increasing this value reduces noise in exchange - // for higher latency. - const uint32_t NITER = _get_config("tex_bandwidth", "niter"); - // Number of memory reads per thread - const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; - // Number of threads needed to read all texells - const uint32_t NTHREAD = NVEC; - // Occupy all threads - const uint32_t local_x = nthread_logic_; - // Ensure that global is a multiple of local, and distribute across all - // SMs - const uint32_t global_x = - (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH; - - auto shader_name = "tex_bandwidth_" + std::to_string(dim); - - std::vector sizes_whd = {MAX_SIZE, 1, 1}; - if (dim == 1) { - sizes_whd = {1, MAX_SIZE, 1}; - } else if (dim == 2) { - sizes_whd = {1, 1, MAX_SIZE}; - } - auto sizes_nchw = _whd_to_nchw(sizes_whd); - - vTensor in_tensor = - api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - - auto bench = [&](uint32_t access_size, uint32_t dim) { - // Number of texels that fit in this iteration - const uint32_t ntexel_access = access_size / VEC_SIZE; - - StorageBuffer out_buf( - context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {global_x, 1, 1}, - {local_x, 1, 1}, - {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)}, - VK_NULL_HANDLE, - 0, - in_tensor.image(), - out_buf.buffer()); - }); - - const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; - double gbps = SIZE_TRANS * 1e-3 / time; - std::cout << "Texture bandwidth accessing \t" << access_size - << "\tB unique data is \t" << gbps << " \tgbps (\t" << time - << "\tus)" << std::endl; - return gbps; - }; - - double max_bandwidth = 0; - double min_bandwidth = DBL_MAX; - for (uint32_t access_size = VEC_SIZE; access_size < RANGE; - access_size *= 2) { - double gbps = bench(access_size, dim); - max_bandwidth = std::max(gbps, max_bandwidth); - min_bandwidth = std::min(gbps, min_bandwidth); - } - - std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth - << std::endl; - std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth - << std::endl; - } - } - - // Warp size is a difficult metric to obtain because the hardware limitations - // do not always coincide with the way the SM divides the workload. For - // instance, the hardware can have a warp size of 64 threads, but an SM might - // be able to simulate concurrency of 128 threads with a single scheduler. - - // Because of this, it is important to measure the warp size different ways, - // that can evidence both the physical limitations of the hardware, and the - // actual behavior of the driver. - - // Additionally,the SM can behave in two different ways when the assigned - // workload is smaller than the warp size. - - // In Case 1, like ARM Mali, the SM can assign dummy workloads to fill empty - // threads and maintain a uniform workload. - - // In Case 2, like in Adreno, the driver might decide to pack multiple works - // together and dispatch them at once. 
- void warp_size(bool verbose = false) { - if (!_enabled("warp_size")) { - std::cout << "Skipped Warp Size" << std::endl; - return; - } - - std::cout << "\n------ Warp Size ------" << std::endl; - - // Method A: Stress test with a kernel that uses complex ALU operations like - // integer division to avoid latency hiding. Increase the number of threads - // until a jump in latency is detected. - - // This timing-based method helps us identify physical warp sizes. It also - // helps with Case 2, when threads of multiple warps are managed by the same - // scheduler at the same time. - const double COMPENSATE = _get_config("warp_size", "compensate"); - const double THRESHOLD = _get_config("warp_size", "threshold"); - - uint32_t NITER; - - auto bench = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "warp_size_physical"; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - // Large number of work groups selected to potentially saturate all - // ALUs and thus have a better baseline for comparison. - {nthread, 1024, 1}, - {nthread, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - out_buf.buffer()); - }); - - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - uint32_t warp_size = subgroup_size_; - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - - // We increase the number of threads until we hit a jump in the data. - uint32_t nthread = 1; - for (; nthread <= nthread_logic_; ++nthread) { - double time = bench(nthread); - std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)" - << std::endl; - if (dj.push(time)) { - warp_size = nthread - 1; - break; - } - } - if (nthread >= nthread_logic_) { - std::cout - << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size" - << std::endl; - } - - // Method B: Let all the threads in a warp race and atomically fetch-add - // a counter, then store the counter values to the output buffer in the - // scheduling order of these threads. If all the order numbers follow an - // ascending order, then the threads are likely executing within a warp. - // Threads in different warps are not managed by the same scheduler, so they - // would race for a same ID out of order, unaware of each other. - - // This method evidences the actual driver behavior when running - // concurrency, regardless of the physical limitations of the hardware. - - // Likewise, this method helps us identify warp sizes when the SM - // sub-divides its ALUs into independent groups, like the three execution - // engines in a Mali G76 core. It helps warp-probing in Case 1 because it - // doesn't depend on kernel timing, so the extra wait time doesn't lead to - // inaccuracy. 
- auto bench_sm = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "warp_size_scheduler"; - - benchmark_on_gpu(shader_name, 1, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread, 1, 1}, - {nthread, 1, 1}, - {}, - VK_NULL_HANDLE, - 0, - out_buf.buffer()); - }); - - std::vector data(nthread_logic_); - copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes()); - - if (verbose) { - std::stringstream ss; - for (auto j = 0; j < nthread; ++j) { - ss << data[j] << " "; - } - std::cout << ss.str() << std::endl; - } - - // Check until which point is the data in ascending order. - int32_t last = -1; - int32_t j = 0; - for (; j < nthread; ++j) { - if (last >= data[j]) { - break; - } - last = data[j]; - } - - return j; - }; - - // Test increasing sizes until the data is no longer in ascending order. - uint32_t warp_size_scheduler = warp_size; - int i = 1; - for (; i <= nthread_logic_; ++i) { - uint32_t nascend = bench_sm(i); - if (nascend != i) { - warp_size_scheduler = nascend; - break; - } - } - if (i > nthread_logic_) { - std::cout << "Unable to conclude an SM Warp Size." << std::endl; - } - - std::cout << "PhysicalWarpSize," << warp_size << std::endl; - std::cout << "SMWarpSize," << warp_size_scheduler << std::endl; - } -}; - -int main(int argc, const char** argv) { - App app; - - std::string file_path = "config.json"; - if (argc > 1) { - file_path = argv[1]; - }; - app.load_config(file_path); - - app.reg_count(); - app.buf_cacheline_size(); - app.buf_bandwidth(); - app.ubo_bandwidth(); - app.shared_mem_bandwidth(); - app.warp_size(); - app.tex_bandwidth(); - app.tex_cacheline_size(); - - return 0; -} diff --git a/backends/vulkan/tools/gpuinfo/src/main.cpp b/backends/vulkan/tools/gpuinfo/src/main.cpp new file mode 100644 index 00000000000..92b11ce9b4b --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/src/main.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "app.h" +#include "architecture.h" +#include "buffers.h" +#include "textures.h" + +using namespace vkapi; + +int main(int argc, const char** argv) { + gpuinfo::App app; + + std::string file_path = "config.json"; + if (argc > 1) { + file_path = argv[1]; + }; + app.load_config(file_path); + + // Architecture + gpuinfo::reg_count(app); + gpuinfo::warp_size(app); + + // Buffers + gpuinfo::buf_cacheline_size(app); + gpuinfo::buf_bandwidth(app); + gpuinfo::ubo_bandwidth(app); + gpuinfo::shared_mem_bandwidth(app); + + // Textures + gpuinfo::tex_bandwidth(app); + gpuinfo::tex_cacheline_size(app); + + return 0; +}
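
With this split, adding a new probe means writing a free function in the gpuinfo namespace that takes an App& and wiring it into main.cpp. Below is a minimal sketch of that pattern, not part of the patch: "my_probe" and the "my_probe_shader" kernel are hypothetical placeholders, while App, StorageBuffer, benchmark_on_gpu, ensure_min_niter, DtJumpFinder and SV are the helpers the existing tests in this patch already use.

#include <iostream>

#include "app.h"
#include "stats.h"
#include "utils.h"

using namespace vkapi;

namespace gpuinfo {

// Hypothetical probe for illustration only; "my_probe" and
// "my_probe_shader" do not exist in this patch.
void my_probe(App& app) {
  // Every test guards itself on config.json, e.g.
  //   {"my_probe": {"enabled": true, "compensate": 0.01, "threshold": 3}}
  if (!app.enabled("my_probe")) {
    std::cout << "Skipped My Probe" << std::endl;
    return;
  }

  const double COMPENSATE = app.get_config("my_probe", "compensate");
  const double THRESHOLD = app.get_config("my_probe", "threshold");

  uint32_t NITER;

  // Device limits are read from the public App fields instead of the
  // former private members of the old App class.
  auto bench = [&](uint32_t nthread) {
    StorageBuffer out_buf(context(), vkapi::kFloat, app.nthread_logic);
    vkapi::PipelineBarrier pipeline_barrier{};

    auto shader_name = "my_probe_shader";

    return benchmark_on_gpu(shader_name, 10, [&]() {
      context()->submit_compute_job(
          VK_KERNEL_FROM_STR(shader_name),
          pipeline_barrier,
          {nthread, 1, 1},
          {nthread, 1, 1},
          {SV(NITER)},
          VK_NULL_HANDLE,
          0,
          out_buf.buffer());
    });
  };

  // Calibrate NITER so a single dispatch is long enough to time reliably.
  ensure_min_niter(1000, NITER, [&]() { return bench(1); });

  // Sweep the thread count and stop at the first latency jump.
  DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
  for (uint32_t nthread = 1; nthread <= app.nthread_logic; ++nthread) {
    const double time = bench(nthread);
    if (dj.push(time)) {
      std::cout << "MyProbeResult," << nthread - 1 << std::endl;
      break;
    }
  }
}

} // namespace gpuinfo

The new function would then be invoked with a single call in main.cpp, e.g. gpuinfo::my_probe(app), and can be disabled per device through the same config file as the existing tests.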