From e03181d1ff078d8b534e53aeab7ed4cce77ea7e3 Mon Sep 17 00:00:00 2001 From: Esteban Padilla Cerdio Date: Tue, 30 Jul 2024 13:31:12 -0700 Subject: [PATCH] Refactor and class split (#4432) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4432 Big classes are scary ☹️ This diff subdivides the tests into categories, moves them out of the App class into free functions in the gpuinfo namespace, and reduces the App class to persisting device information and configuration. Reviewed By: jorgep31415 Differential Revision: D60290882 fbshipit-source-id: b57f6e824be33320c01eebc5d5b72cbd2ad4c0cf --- backends/vulkan/tools/gpuinfo/config.json | 2 +- backends/vulkan/tools/gpuinfo/include/app.h | 114 +++ .../tools/gpuinfo/include/architecture.h | 285 +++++++ .../vulkan/tools/gpuinfo/include/buffers.h | 203 +++++ .../vulkan/tools/gpuinfo/include/textures.h | 207 +++++ backends/vulkan/tools/gpuinfo/include/utils.h | 9 + backends/vulkan/tools/gpuinfo/src/app.cpp | 790 ------------------ backends/vulkan/tools/gpuinfo/src/main.cpp | 40 + 8 files changed, 859 insertions(+), 791 deletions(-) create mode 100644 backends/vulkan/tools/gpuinfo/include/app.h create mode 100644 backends/vulkan/tools/gpuinfo/include/architecture.h create mode 100644 backends/vulkan/tools/gpuinfo/include/buffers.h create mode 100644 backends/vulkan/tools/gpuinfo/include/textures.h delete mode 100644 backends/vulkan/tools/gpuinfo/src/app.cpp create mode 100644 backends/vulkan/tools/gpuinfo/src/main.cpp diff --git a/backends/vulkan/tools/gpuinfo/config.json b/backends/vulkan/tools/gpuinfo/config.json index 7307f29503..afb5cbc6c5 100644 --- a/backends/vulkan/tools/gpuinfo/config.json +++ b/backends/vulkan/tools/gpuinfo/config.json @@ -23,7 +23,7 @@ "nunroll": 16, "niter": 10 }, - "shared_mem_bandwidth": { + "shared_bandwidth": { "enabled": true, "nflush": 4, "nunroll": 16, diff --git a/backends/vulkan/tools/gpuinfo/include/app.h b/backends/vulkan/tools/gpuinfo/include/app.h new file mode 100644 index 0000000000..a46e9e6b9a --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/include/app.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree.
+ */ + +#pragma once + +#include +#include +#include +#include + +#include "utils.h" + +namespace gpuinfo { + +class App { + private: + folly::dynamic config_; + + public: + size_t buf_cache_size; + uint32_t max_shared_mem_size; + uint32_t sm_count; + uint32_t nthread_logic; + uint32_t subgroup_size; + uint32_t max_tex_width; + uint32_t max_tex_height; + uint32_t max_tex_depth; + + App() { + context()->initialize_querypool(); + + std::cout << context()->adapter_ptr()->stringize() << std::endl + << std::endl; + + auto cl_device = get_cl_device(); + + sm_count = cl_device.getInfo(); + nthread_logic = cl_device.getInfo(); + buf_cache_size = cl_device.getInfo(); + max_shared_mem_size = cl_device.getInfo(); + max_tex_width = cl_device.getInfo(); + max_tex_height = cl_device.getInfo(); + max_tex_depth = cl_device.getInfo(); + + VkPhysicalDeviceSubgroupProperties subgroup_props{}; + VkPhysicalDeviceProperties2 props2{}; + + props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + props2.pNext = &subgroup_props; + subgroup_props.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; + vkGetPhysicalDeviceProperties2( + context()->adapter_ptr()->physical_handle(), &props2); + subgroup_size = subgroup_props.subgroupSize; + + std::cout << std::endl; + std::cout << "SM count," << sm_count << std::endl; + std::cout << "Logic Thread Count," << nthread_logic << std::endl; + std::cout << "Cache Size," << buf_cache_size << std::endl; + std::cout << "Shared Memory Size," << max_shared_mem_size << std::endl; + std::cout << "SubGroup Size," << subgroup_size << std::endl; + std::cout << "MaxTexWidth," << max_tex_width << std::endl; + std::cout << "MaxTexHeight," << max_tex_height << std::endl; + std::cout << "MaxTexDepth," << max_tex_depth << std::endl; + } + + float get_config(const std::string& test, const std::string& key) const { + if (config_[test].empty()) { + throw std::runtime_error("Missing config for " + test); + } + + if (!config_[test][key].isNumber()) { + throw std::runtime_error( + "Config for " + test + "." + key + " is not a number"); + } + + float value; + if (config_[test][key].isDouble()) { + value = config_[test][key].getDouble(); + } else { + value = config_[test][key].getInt(); + } + + std::cout << "Read value for " << test << "." << key << " = " << value + << std::endl; + return value; + } + + bool enabled(const std::string& test) const { + if (config_.empty() || config_[test].empty() || + !config_[test]["enabled"].isBool()) { + return true; + } + return config_[test]["enabled"].getBool(); + } + + void load_config(std::string file_path) { + std::ifstream file(file_path); + std::stringstream buffer; + buffer << file.rdbuf(); + const std::string json_str = buffer.str(); + if (json_str.empty()) { + throw std::runtime_error( + "Failed to read config file from " + file_path + "."); + } + config_ = folly::parseJson(json_str); + } +}; +} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h new file mode 100644 index 0000000000..0d312ee87c --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/include/architecture.h @@ -0,0 +1,285 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +#include "app.h" +#include "stats.h" +#include "utils.h" + +using namespace vkapi; + +namespace gpuinfo { + +void reg_count(const App& app) { + if (!app.enabled("reg_count")) { + std::cout << "Skipped Register Count" << std::endl; + return; + } + + std::cout << std::endl; + std::cout << "------ Register Count ------" << std::endl; + const uint32_t NREG_MIN = 1; + const uint32_t NREG_MAX = 512; + const uint32_t NREG_STEP = 1; + + const double COMPENSATE = app.get_config("reg_count", "compensate"); + const double THRESHOLD = app.get_config("reg_count", "threshold"); + + const uint32_t NGRP_MIN = 1; + const uint32_t NGRP_MAX = 64; + const uint32_t NGRP_STEP = 1; + + uint32_t NITER; + + auto bench = [&](uint32_t ngrp, uint32_t nreg) { + StorageBuffer buffer(context(), vkapi::kFloat, 1); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "reg_count_" + std::to_string(nreg); + + auto time = benchmark_on_gpu(shader_name, 30, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {1, ngrp, 1}, + {1, 1, 1}, + {SV(NITER)}, + VK_NULL_HANDLE, + 0, + buffer.buffer()); + }); + return time; + }; + + ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); }); + + uint32_t nreg_max; + + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + uint32_t nreg = NREG_MIN; + for (; nreg <= NREG_MAX; nreg += NREG_STEP) { + double time = bench(1, nreg); + std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << "\tus" + << std::endl; + if (dj.push(time)) { + nreg -= NREG_STEP; + nreg_max = nreg; + break; + } + } + if (nreg >= NREG_MAX) { + std::cout << "Unable to conclude a maximal register count" << std::endl; + nreg_max = NREG_STEP; + } else { + std::cout << nreg_max << " registers are available at most" << std::endl; + } + + auto find_ngrp_by_nreg = [&](const uint32_t nreg) { + DtJumpFinder<3> dj(COMPENSATE, THRESHOLD); + for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) { + auto time = bench(ngrp, nreg); + std::cout << "Testing occupation (nreg=\t" << nreg << "\t); ngrp=\t" + << ngrp << "\t, time=\t" << time << "\tus" << std::endl; + + if (dj.push(time)) { + ngrp -= NGRP_STEP; + std::cout << "Using " << nreg << " registers can have " << ngrp + << " concurrent single-thread workgroups" << std::endl; + return ngrp; + } + } + std::cout + << "Unable to conclude a maximum number of concurrent single-thread workgroups when " + << nreg << " registers are occupied" << std::endl; + return (uint32_t)1; + }; + + uint32_t ngrp_full, ngrp_half; + ngrp_full = find_ngrp_by_nreg(nreg_max); + ngrp_half = find_ngrp_by_nreg(nreg_max / 2); + + std::string reg_ty; + + if (ngrp_full * 1.5 < ngrp_half) { + std::cout << "All physical threads in an sm share " << nreg_max + << " registers" << std::endl; + reg_ty = "Pooled"; + + } else { + std::cout << "Each physical thread has " << nreg_max << " registers" + << std::endl; + reg_ty = "Dedicated"; + } + + std::cout << std::endl << std::endl; + std::cout << "MaxRegisters," << nreg_max << std::endl; + std::cout << "ConcurrentWorkgroupsFullReg," << ngrp_full << std::endl; + std::cout << "ConcurrentWorkgroupsHalfReg," << ngrp_half << std::endl; + std::cout << "RegisterType," << reg_ty << std::endl; +} + +// Warp size is a difficult metric to obtain because the hardware limitations +// do not always coincide with the way the SM divides the workload. 
For +// instance, the hardware can have a warp size of 64 threads, but an SM might +// be able to simulate concurrency of 128 threads with a single scheduler. + +// Because of this, it is important to measure the warp size different ways, +// that can evidence both the physical limitations of the hardware, and the +// actual behavior of the driver. + +// Additionally,the SM can behave in two different ways when the assigned +// workload is smaller than the warp size. + +// In Case 1, like ARM Mali, the SM can assign dummy workloads to fill empty +// threads and maintain a uniform workload. + +// In Case 2, like in Adreno, the driver might decide to pack multiple works +// together and dispatch them at once. +void warp_size(const App& app, const bool verbose = false) { + if (!app.enabled("warp_size")) { + std::cout << "Skipped Warp Size" << std::endl; + return; + } + + std::cout << "\n------ Warp Size ------" << std::endl; + + // Method A: Stress test with a kernel that uses complex ALU operations like + // integer division to avoid latency hiding. Increase the number of threads + // until a jump in latency is detected. + + // This timing-based method helps us identify physical warp sizes. It also + // helps with Case 2, when threads of multiple warps are managed by the same + // scheduler at the same time. + const double COMPENSATE = app.get_config("warp_size", "compensate"); + const double THRESHOLD = app.get_config("warp_size", "threshold"); + + uint32_t NITER; + + auto bench = [&](uint32_t nthread) { + StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "warp_size_physical"; + + auto time = benchmark_on_gpu(shader_name, 10, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + // Large number of work groups selected to potentially saturate all + // ALUs and thus have a better baseline for comparison. + {nthread, 1024, 1}, + {nthread, 1, 1}, + {SV(NITER)}, + VK_NULL_HANDLE, + 0, + out_buf.buffer()); + }); + + return time; + }; + + ensure_min_niter(1000, NITER, [&]() { return bench(1); }); + + uint32_t warp_size = app.subgroup_size; + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + + // We increase the number of threads until we hit a jump in the data. + uint32_t nthread = 1; + for (; nthread <= app.nthread_logic; ++nthread) { + double time = bench(nthread); + std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)" + << std::endl; + if (dj.push(time)) { + warp_size = nthread - 1; + break; + } + } + if (nthread >= app.nthread_logic) { + std::cout + << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size" + << std::endl; + } + + // Method B: Let all the threads in a warp race and atomically fetch-add + // a counter, then store the counter values to the output buffer in the + // scheduling order of these threads. If all the order numbers follow an + // ascending order, then the threads are likely executing within a warp. + // Threads in different warps are not managed by the same scheduler, so they + // would race for a same ID out of order, unaware of each other. + + // This method evidences the actual driver behavior when running + // concurrency, regardless of the physical limitations of the hardware. + + // Likewise, this method helps us identify warp sizes when the SM + // sub-divides its ALUs into independent groups, like the three execution + // engines in a Mali G76 core. 
It helps warp-probing in Case 1 because it + // doesn't depend on kernel timing, so the extra wait time doesn't lead to + // inaccuracy. + auto bench_sm = [&](uint32_t nthread) { + StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "warp_size_scheduler"; + + benchmark_on_gpu(shader_name, 1, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {nthread, 1, 1}, + {nthread, 1, 1}, + {}, + VK_NULL_HANDLE, + 0, + out_buf.buffer()); + }); + + std::vector data(app.nthread_logic); + copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes()); + + if (verbose) { + std::stringstream ss; + for (auto j = 0; j < nthread; ++j) { + ss << data[j] << " "; + } + std::cout << ss.str() << std::endl; + } + + // Check until which point is the data in ascending order. + int32_t last = -1; + int32_t j = 0; + for (; j < nthread; ++j) { + if (last >= data[j]) { + break; + } + last = data[j]; + } + + return j; + }; + + // Test increasing sizes until the data is no longer in ascending order. + uint32_t warp_size_scheduler = warp_size; + int i = 1; + for (; i <= app.nthread_logic; ++i) { + uint32_t nascend = bench_sm(i); + if (nascend != i) { + warp_size_scheduler = nascend; + break; + } + } + if (i > app.nthread_logic) { + std::cout << "Unable to conclude an SM Warp Size." << std::endl; + } + + std::cout << "PhysicalWarpSize," << warp_size << std::endl; + std::cout << "SMWarpSize," << warp_size_scheduler << std::endl; +} +}; // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h new file mode 100644 index 0000000000..8cb0da49ca --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/include/buffers.h @@ -0,0 +1,203 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include "app.h" +#include "stats.h" +#include "utils.h" + +using namespace vkapi; + +namespace gpuinfo { + +void buf_cacheline_size(const App& app) { + if (!app.enabled("buf_cacheline_size")) { + std::cout << "Skipped Buffer Cacheline Size" << std::endl; + return; + } + + std::cout << std::endl; + std::cout << "------ Buffer Cacheline Size ------" << std::endl; + + const double COMPENSATE = app.get_config("buf_cacheline_size", "compensate"); + const double THRESHOLD = app.get_config("buf_cacheline_size", "threshold"); + + const uint32_t PITCH = app.buf_cache_size / app.nthread_logic; + const uint32_t BUF_SIZE = app.buf_cache_size; + const uint32_t MAX_STRIDE = PITCH; + + uint32_t NITER; + + auto bench = [&](int stride) { + StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); + StorageBuffer out_buf(context(), vkapi::kFloat, 1); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "buf_cacheline_size"; + + auto time = benchmark_on_gpu(shader_name, 100, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {app.nthread_logic, 1, 1}, + {app.nthread_logic, 1, 1}, + {SV(NITER), SV(stride), SV(PITCH)}, + VK_NULL_HANDLE, + 0, + in_buf.buffer(), + out_buf.buffer()); + }); + return time; + }; + + ensure_min_niter(1000, NITER, [&]() { return bench(1); }); + + uint32_t cacheline_size; + + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + uint32_t stride = 1; + for (; stride <= MAX_STRIDE; ++stride) { + double time = bench(stride); + std::cout << "Testing stride=\t" << stride << "\t, time=\t" << time + << std::endl; + + if (dj.push(time)) { + cacheline_size = stride * sizeof(float); + break; + } + } + if (stride >= MAX_STRIDE) { + std::cout << "Unable to conclude a top level buffer cacheline size." + << std::endl; + cacheline_size = MAX_STRIDE * sizeof(float); + } + + std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl; +} + +void _bandwidth( + const App& app, + const std::string memtype, + const uint32_t range) { + auto memtype_lower = memtype; + std::transform( + memtype_lower.begin(), + memtype_lower.end(), + memtype_lower.begin(), + [](unsigned char c) { return std::tolower(c); }); + + auto test_name = memtype_lower + "_bandwidth"; + + // Cache lines flushed + const uint32_t NFLUSH = app.get_config(test_name, "nflush"); + // Number of loop unrolls. Changing this value requires an equal change in + // buf_bandwidth.yaml + const uint32_t NUNROLL = app.get_config(test_name, "nunroll"); + // Number of iterations. Increasing this value reduces noise in exchange for + // higher latency. + const uint32_t NITER = app.get_config(test_name, "niter"); + // Vector dimensions (vec4) + const uint32_t VEC_WIDTH = 4; + const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); + // Number of vectors that fit in the selected memory space + const uint32_t NVEC = range / VEC_SIZE; + // Number of memory reads per thread + const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; + // Number of threads needed to read al l vectors + // The thread count doesn't divide by thread workload in shared memory + // because of the limited memory size. + const uint32_t NTHREAD = memtype == "Shared" ? 
NVEC : NVEC / NREAD_PER_THREAD; + // Occupy all threads + const uint32_t local_x = app.nthread_logic; + // Ensure that global is a multiple of local, and distribute across all SMs + const uint32_t global_x = + (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH; + + auto bench = [&](uint32_t access_size) { + // Number of vectors that fit in this iteration + const uint32_t nvec_access = access_size / VEC_SIZE; + + StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); + StorageBuffer out_buf( + context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "buf_bandwidth_" + memtype_lower; + + auto time = benchmark_on_gpu(shader_name, 10, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {global_x, 1, 1}, + {local_x, 1, 1}, + {SV(NITER), SV(nvec_access), SV(local_x)}, + VK_NULL_HANDLE, + 0, + in_buf.buffer(), + out_buf.buffer()); + }); + + const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; + auto gbps = SIZE_TRANS * 1e-3 / time; + std::cout << memtype << " bandwidth accessing \t" << access_size + << "\tB unique data is \t" << gbps << " \tgbps (\t" << time + << "\tus)" << std::endl; + return gbps; + }; + + double max_bandwidth = 0; + double min_bandwidth = DBL_MAX; + for (uint32_t access_size = VEC_SIZE; access_size < range; access_size *= 2) { + double gbps = bench(access_size); + max_bandwidth = std::max(gbps, max_bandwidth); + min_bandwidth = std::min(gbps, min_bandwidth); + } + + std::cout << "Max" << memtype << "Bandwidth (GB/s)," << max_bandwidth + << std::endl; + std::cout << "Min" << memtype << "Bandwidth (GB/s)," << min_bandwidth + << std::endl; +} + +void buf_bandwidth(const App& app) { + if (!app.enabled("buffer_bandwidth")) { + std::cout << "Skipped Memory Bandwidth" << std::endl; + return; + } + + std::cout << "\n------ Memory Bandwidth ------" << std::endl; + // Maximum memory space read - 128MB + // For regular devices, bandwidth plateaus at less memory than this, so more + // is not needed. + const uint32_t RANGE = app.get_config("buffer_bandwidth", "range"); + _bandwidth(app, "Buffer", RANGE); +} + +void ubo_bandwidth(const App& app) { + if (!app.enabled("ubo_bandwidth")) { + std::cout << "Skipped UBO Bandwidth" << std::endl; + return; + } + + std::cout << "\n------ UBO Bandwidth ------" << std::endl; + const uint32_t RANGE = app.get_config("ubo_bandwidth", "range"); + _bandwidth(app, "UBO", RANGE); +} + +void shared_mem_bandwidth(const App& app) { + if (!app.enabled("shared_bandwidth")) { + std::cout << "Skipped Shared Memory Bandwidth" << std::endl; + return; + } + + std::cout << "\n------ Shared Bandwidth ------" << std::endl; + const uint32_t RANGE = app.max_shared_mem_size; + _bandwidth(app, "Shared", RANGE); +} +} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h new file mode 100644 index 0000000000..bb8a3371a9 --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/include/textures.h @@ -0,0 +1,207 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include "app.h" +#include "stats.h" +#include "utils.h" + +namespace gpuinfo { + +// Textures are drastically different from buffers in terms of data layout. 
+// While buffers are a contiguous range of memory, textures are opaque objects +// defined by the vendor and it is possible that nearby points of data are not +// neighboring in memory. Likewise, data points are accessed in +// multi-dimensional patches instead of simple lines. This makes the stride +// method for figuring out the cache line size not applicable. To go around +// this, this experiment runs an increasing amount of threads accessing +// different datapoints in the texture and measures latency. If the cache line +// is big enough to contain all requested data for the amount of threads, +// latency will be low. When there are more threads and hence more data than +// what a single cache line can handle, a second line must be fetched, +// increasing latency in a measurable way. +void tex_cacheline_concurr(const App& app) { + if (!app.enabled("tex_cacheline_concurr")) { + std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl; + return; + } + + const uint32_t TEXEL_WIDTH = 4; + const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH; + + const double COMPENSATE = + app.get_config("tex_cacheline_concurr", "compensate"); + const double THRESHOLD = app.get_config("tex_cacheline_concurr", "threshold"); + + for (int dim = 0; dim < 3; ++dim) { + std::cout << std::endl; + std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim + << ") ------" << std::endl; + + uint32_t NITER; + + const uint32_t IMG_OTHER_EDGE = dim == 0 ? app.max_tex_width + : dim == 1 ? app.max_tex_height + : app.max_tex_depth; + + const uint32_t MAX_NTHREAD = std::min(app.nthread_logic, IMG_OTHER_EDGE); + + auto bench = [&](uint32_t nthread) { + std::vector sizes_whd = { + app.max_tex_width, app.max_tex_height, app.max_tex_depth}; + + auto sizes_nchw = whd_to_nchw(sizes_whd); + + vTensor in_tensor = + api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); + + StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); + + vkapi::PipelineBarrier pipeline_barrier{}; + + auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim); + + auto time = benchmark_on_gpu(shader_name, 100, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {nthread, 1, 1}, + {nthread, 1, 1}, + {SV(NITER)}, + VK_NULL_HANDLE, + 0, + in_tensor.image(), + out_buf.buffer()); + }); + return time; + }; + + ensure_min_niter(1000, NITER, [&]() { return bench(1); }); + + DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); + uint32_t nthread = 1; + for (; nthread <= MAX_NTHREAD; ++nthread) { + double time = bench(nthread); + std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time + << std::endl; + + if (dj.push(time)) { + auto max_concurrency = nthread - 1; + std::cout << "TextureCachelineConcurrencyDim" << dim << " (B)," + << max_concurrency * TEXEL_SIZE << std::endl; + break; + } + } + if (nthread >= MAX_NTHREAD) { + std::cout + << "Unable to conclude an optimal texture cacheline concurrency for dim " + << dim << std::endl; + }; + } + + // TODO: Use concurrency information to obtain the cache line size for + // textures as done in https://fburl.com/98xiou3g +} + +void tex_bandwidth(const App& app) { + if (!app.enabled("tex_bandwidth")) { + std::cout << "Skipped Texture Bandwidth" << std::endl; + return; + } + + for (int dim = 0; dim < 3; dim++) { + std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------" + << std::endl; + const uint32_t MAX_SIZE = dim == 0 ? app.max_tex_width + : dim == 1 ? 
app.max_tex_height + : app.max_tex_depth; + + // rgba, float + const uint32_t VEC_WIDTH = 4; + const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); + const uint32_t NVEC = MAX_SIZE; + + const uint32_t RANGE = NVEC * VEC_SIZE; + + // Cache lines flushed + const uint32_t NFLUSH = app.get_config("tex_bandwidth", "nflush"); + // Number of loop unrolls. Changing this value requires an equal change in + // tex_bandwidth.yaml + const uint32_t NUNROLL = app.get_config("tex_bandwidth", "nunroll"); + // Number of iterations. Increasing this value reduces noise in exchange + // for higher latency. + const uint32_t NITER = app.get_config("tex_bandwidth", "niter"); + // Number of memory reads per thread + const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; + // Number of threads needed to read all texells + const uint32_t NTHREAD = NVEC; + // Occupy all threads + const uint32_t local_x = app.nthread_logic; + // Ensure that global is a multiple of local, and distribute across all + // SMs + const uint32_t global_x = + (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH; + + auto shader_name = "tex_bandwidth_" + std::to_string(dim); + + std::vector sizes_whd = {MAX_SIZE, 1, 1}; + if (dim == 1) { + sizes_whd = {1, MAX_SIZE, 1}; + } else if (dim == 2) { + sizes_whd = {1, 1, MAX_SIZE}; + } + auto sizes_nchw = whd_to_nchw(sizes_whd); + + vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); + + auto bench = [&](uint32_t access_size, uint32_t dim) { + // Number of texels that fit in this iteration + const uint32_t ntexel_access = access_size / VEC_SIZE; + + StorageBuffer out_buf( + context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); + vkapi::PipelineBarrier pipeline_barrier{}; + + auto time = benchmark_on_gpu(shader_name, 10, [&]() { + context()->submit_compute_job( + VK_KERNEL_FROM_STR(shader_name), + pipeline_barrier, + {global_x, 1, 1}, + {local_x, 1, 1}, + {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)}, + VK_NULL_HANDLE, + 0, + in_tensor.image(), + out_buf.buffer()); + }); + + const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; + double gbps = SIZE_TRANS * 1e-3 / time; + std::cout << "Texture bandwidth accessing \t" << access_size + << "\tB unique data is \t" << gbps << " \tgbps (\t" << time + << "\tus)" << std::endl; + return gbps; + }; + + double max_bandwidth = 0; + double min_bandwidth = DBL_MAX; + for (uint32_t access_size = VEC_SIZE; access_size < RANGE; + access_size *= 2) { + double gbps = bench(access_size, dim); + max_bandwidth = std::max(gbps, max_bandwidth); + min_bandwidth = std::min(gbps, min_bandwidth); + } + + std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth + << std::endl; + std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth + << std::endl; + } +} +} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/utils.h b/backends/vulkan/tools/gpuinfo/include/utils.h index 231fb32c5a..887cb443ef 100644 --- a/backends/vulkan/tools/gpuinfo/include/utils.h +++ b/backends/vulkan/tools/gpuinfo/include/utils.h @@ -54,6 +54,15 @@ void ensure_min_niter( } } +std::vector whd_to_nchw(std::vector sizes) { + const int64_t W = sizes[0]; + const int64_t H = sizes[1]; + const int64_t D = sizes[2]; + + // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} + return {1, D * 4, H, W}; +} + cl_platform_id get_cl_platform_id() { cl_uint nplatform_id; clGetPlatformIDs(0, nullptr, &nplatform_id); diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp deleted 
file mode 100644 index 2b1621db62..0000000000 --- a/backends/vulkan/tools/gpuinfo/src/app.cpp +++ /dev/null @@ -1,790 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include -#include - -#include "stats.h" -#include "utils.h" - -using namespace vkapi; - -class App { - private: - size_t buf_cache_size_; - uint32_t max_shared_mem_size_; - uint32_t sm_count_; - uint32_t nthread_logic_; - uint32_t subgroup_size_; - uint32_t max_tex_width_; - uint32_t max_tex_height_; - uint32_t max_tex_depth_; - folly::dynamic config_; - - std::vector _whd_to_nchw(std::vector sizes) { - const int64_t W = sizes[0]; - const int64_t H = sizes[1]; - const int64_t D = sizes[2]; - - // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} - return {1, D * 4, H, W}; - } - - float _get_config(const std::string& test, const std::string& key) { - if (config_[test].empty()) { - throw std::runtime_error("Missing config for " + test); - } - - if (!config_[test][key].isNumber()) { - throw std::runtime_error( - "Config for " + test + "." + key + " is not a number"); - } - - float value; - if (config_[test][key].isDouble()) { - value = config_[test][key].getDouble(); - } else { - value = config_[test][key].getInt(); - } - - std::cout << "Read value for " << test << "." << key << " = " << value - << std::endl; - return value; - } - - bool _enabled(const std::string& test) { - if (config_.empty() || config_[test].empty() || - !config_[test]["enabled"].isBool()) { - return true; - } - return config_[test]["enabled"].getBool(); - } - - public: - App() { - context()->initialize_querypool(); - - std::cout << context()->adapter_ptr()->stringize() << std::endl - << std::endl; - - auto cl_device = get_cl_device(); - - sm_count_ = cl_device.getInfo(); - nthread_logic_ = cl_device.getInfo(); - buf_cache_size_ = cl_device.getInfo(); - max_shared_mem_size_ = cl_device.getInfo(); - max_tex_width_ = cl_device.getInfo(); - max_tex_height_ = cl_device.getInfo(); - max_tex_depth_ = cl_device.getInfo(); - - VkPhysicalDeviceSubgroupProperties subgroup_props{}; - VkPhysicalDeviceProperties2 props2{}; - - props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; - props2.pNext = &subgroup_props; - subgroup_props.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; - vkGetPhysicalDeviceProperties2( - context()->adapter_ptr()->physical_handle(), &props2); - subgroup_size_ = subgroup_props.subgroupSize; - - std::cout << std::endl; - std::cout << "SM count," << sm_count_ << std::endl; - std::cout << "Logic Thread Count," << nthread_logic_ << std::endl; - std::cout << "Cache Size," << buf_cache_size_ << std::endl; - std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl; - std::cout << "SubGroup Size," << subgroup_size_ << std::endl; - std::cout << "MaxTexWidth," << max_tex_width_ << std::endl; - std::cout << "MaxTexHeight," << max_tex_height_ << std::endl; - std::cout << "MaxTexDepth," << max_tex_depth_ << std::endl; - } - - void load_config(std::string file_path) { - std::ifstream file(file_path); - std::stringstream buffer; - buffer << file.rdbuf(); - const std::string json_str = buffer.str(); - if (json_str.empty()) { - throw std::runtime_error( - "Failed to read config file from " + file_path + "."); - } - config_ = folly::parseJson(json_str); - } - - void reg_count() { - if (!_enabled("reg_count")) { 
- std::cout << "Skipped Register Count" << std::endl; - return; - } - - std::cout << std::endl; - std::cout << "------ Register Count ------" << std::endl; - const uint32_t NREG_MIN = 1; - const uint32_t NREG_MAX = 512; - const uint32_t NREG_STEP = 1; - - const double COMPENSATE = _get_config("reg_count", "compensate"); - const double THRESHOLD = _get_config("reg_count", "threshold"); - - const uint32_t NGRP_MIN = 1; - const uint32_t NGRP_MAX = 64; - const uint32_t NGRP_STEP = 1; - - uint32_t NITER; - - auto bench = [&](uint32_t ngrp, uint32_t nreg) { - StorageBuffer buffer(context(), vkapi::kFloat, 1); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "reg_count_" + std::to_string(nreg); - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {1, ngrp, 1}, - {1, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - buffer.buffer()); - }); - return time; - }; - - std::cout << "Calculating NITER..." << std::endl; - ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); }); - std::cout << "NITER," << NITER << std::endl; - - uint32_t nreg_max; - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t nreg = NREG_MIN; - for (; nreg <= NREG_MAX; nreg += NREG_STEP) { - double time = bench(1, nreg); - std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time - << std::endl; - if (dj.push(time)) { - nreg -= NREG_STEP; - nreg_max = nreg; - break; - } - } - if (nreg >= NREG_MAX) { - std::cout << "Unable to conclude a maximal register count" << std::endl; - nreg_max = NREG_STEP; - } else { - std::cout << nreg_max << " registers are available at most" << std::endl; - } - - auto find_ngrp_by_nreg = [&](const uint32_t nreg) { - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) { - auto time = bench(ngrp, nreg); - std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp - << ", time=" << time << " us" << std::endl; - - if (dj.push(time)) { - ngrp -= NGRP_STEP; - std::cout << "Using " << nreg << " registers can have " << ngrp - << " concurrent single-thread workgroups" << std::endl; - return ngrp; - } - } - std::cout - << "Unable to conclude a maximum number of concurrent single-thread workgroups when " - << nreg << " registers are occupied" << std::endl; - return (uint32_t)1; - }; - - uint32_t ngrp_full, ngrp_half; - ngrp_full = find_ngrp_by_nreg(nreg_max); - ngrp_half = find_ngrp_by_nreg(nreg_max / 2); - - std::string reg_ty; - - if (ngrp_full * 1.5 < ngrp_half) { - std::cout << "All physical threads in an sm share " << nreg_max - << " registers" << std::endl; - reg_ty = "Pooled"; - - } else { - std::cout << "Each physical thread has " << nreg_max << " registers" - << std::endl; - reg_ty = "Dedicated"; - } - - std::cout << std::endl << std::endl; - std::cout << "NITER," << NITER << std::endl; - std::cout << "Max registers," << nreg_max << std::endl; - std::cout << "Concurrent full single thread workgroups," << ngrp_full - << std::endl; - std::cout << "Concurrent half single thread workgroups," << ngrp_half - << std::endl; - std::cout << "Register type," << reg_ty << std::endl; - } - - void buf_cacheline_size() { - if (!_enabled("buf_cacheline_size")) { - std::cout << "Skipped Buffer Cacheline Size" << std::endl; - return; - } - - std::cout << std::endl; - std::cout << "------ Buffer Cacheline Size ------" << std::endl; - - const double COMPENSATE = _get_config("buf_cacheline_size", "compensate"); - const double 
THRESHOLD = _get_config("buf_cacheline_size", "threshold"); - - const uint32_t PITCH = buf_cache_size_ / nthread_logic_; - const uint32_t BUF_SIZE = buf_cache_size_; - const uint32_t MAX_STRIDE = PITCH; - - uint32_t NITER; - - auto bench = [&](int stride) { - StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); - StorageBuffer out_buf(context(), vkapi::kFloat, 1); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "buf_cacheline_size"; - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread_logic_, 1, 1}, - {nthread_logic_, 1, 1}, - {SV(NITER), SV(stride), SV(PITCH)}, - VK_NULL_HANDLE, - 0, - in_buf.buffer(), - out_buf.buffer()); - }); - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - uint32_t cacheline_size; - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t stride = 1; - for (; stride <= MAX_STRIDE; ++stride) { - double time = bench(stride); - std::cout << "Testing stride=\t" << stride << "\t, time=\t" << time - << std::endl; - - if (dj.push(time)) { - cacheline_size = stride * sizeof(float); - break; - } - } - if (stride >= MAX_STRIDE) { - std::cout << "Unable to conclude a top level buffer cacheline size." - << std::endl; - cacheline_size = MAX_STRIDE * sizeof(float); - } - - std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl; - } - - // Textures are drastically different from buffers in terms of data layout. - // While buffers are a contiguous range of memory, textures are opaque objects - // defined by the vendor and it is possible that nearby points of data are not - // neighboring in memory. Likewise, data points are accessed in - // multi-dimensional patches instead of simple lines. This makes the stride - // method for figuring out the cache line size not applicable. To go around - // this, this experiment runs an increasing amount of threads accessing - // different datapoints in the texture and measures latency. If the cache line - // is big enough to contain all requested data for the amount of threads, - // latency will be low. When there are more threads and hence more data than - // what a single cache line can handle, a second line must be fetched, - // increasing latency in a measurable way. - void tex_cacheline_concurr() { - if (!_enabled("tex_cacheline_concurr")) { - std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl; - return; - } - - const uint32_t TEXEL_WIDTH = 4; - const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH; - - const double COMPENSATE = - _get_config("tex_cacheline_concurr", "compensate"); - const double THRESHOLD = _get_config("tex_cacheline_concurr", "threshold"); - - for (int dim = 0; dim < 3; ++dim) { - std::cout << std::endl; - std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim - << ") ------" << std::endl; - - uint32_t NITER; - - const uint32_t IMG_OTHER_EDGE = dim == 0 ? max_tex_width_ - : dim == 1 ? 
max_tex_height_ - : max_tex_depth_; - - const uint32_t MAX_NTHREAD = std::min(nthread_logic_, IMG_OTHER_EDGE); - - auto bench = [&](uint32_t nthread) { - std::vector sizes_whd = { - max_tex_width_, max_tex_height_, max_tex_depth_}; - - auto sizes_nchw = _whd_to_nchw(sizes_whd); - - vTensor in_tensor = - api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - - StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); - - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim); - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread, 1, 1}, - {nthread, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - in_tensor.image(), - out_buf.buffer()); - }); - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t nthread = 1; - for (; nthread <= MAX_NTHREAD; ++nthread) { - double time = bench(nthread); - std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time - << std::endl; - - if (dj.push(time)) { - auto max_concurrency = nthread - 1; - std::cout << "TextureCachelineConcurrencyDim" << dim << " (B)," - << max_concurrency * TEXEL_SIZE << std::endl; - break; - } - } - if (nthread >= MAX_NTHREAD) { - std::cout - << "Unable to conclude an optimal texture cacheline concurrency for dim " - << dim << std::endl; - }; - } - - // TODO: Use concurrency information to obtain the cache line size for - // textures as done in https://fburl.com/98xiou3g - } - - private: - void _bandwidth(std::string memtype, uint32_t range) { - auto memtype_lower = memtype; - std::transform( - memtype_lower.begin(), - memtype_lower.end(), - memtype_lower.begin(), - [](unsigned char c) { return std::tolower(c); }); - - auto test_name = memtype_lower + "_bandwidth"; - - // Cache lines flushed - const uint32_t NFLUSH = _get_config(test_name, "nflush"); - // Number of loop unrolls. Changing this value requires an equal change in - // buf_bandwidth.yaml - const uint32_t NUNROLL = _get_config(test_name, "nunroll"); - // Number of iterations. Increasing this value reduces noise in exchange for - // higher latency. - const uint32_t NITER = _get_config(test_name, "niter"); - // Vector dimensions (vec4) - const uint32_t VEC_WIDTH = 4; - const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); - // Number of vectors that fit in the selected memory space - const uint32_t NVEC = range / VEC_SIZE; - // Number of memory reads per thread - const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; - // Number of threads needed to read al l vectors - // The thread count doesn't divide by thread workload in shared memory - // because of the limited memory size. - const uint32_t NTHREAD = - memtype == "Shared" ? 
NVEC : NVEC / NREAD_PER_THREAD; - // Occupy all threads - const uint32_t local_x = nthread_logic_; - // Ensure that global is a multiple of local, and distribute across all SMs - const uint32_t global_x = - (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH; - - auto bench = [&](uint32_t access_size) { - // Number of vectors that fit in this iteration - const uint32_t nvec_access = access_size / VEC_SIZE; - - StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); - StorageBuffer out_buf( - context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "buf_bandwidth_" + memtype_lower; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {global_x, 1, 1}, - {local_x, 1, 1}, - {SV(NITER), SV(nvec_access), SV(local_x)}, - VK_NULL_HANDLE, - 0, - in_buf.buffer(), - out_buf.buffer()); - }); - - const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; - auto gbps = SIZE_TRANS * 1e-3 / time; - std::cout << memtype << " bandwidth accessing \t" << access_size - << "\tB unique data is \t" << gbps << " \tgbps (\t" << time - << "\tus)" << std::endl; - return gbps; - }; - - double max_bandwidth = 0; - double min_bandwidth = DBL_MAX; - for (uint32_t access_size = VEC_SIZE; access_size < range; - access_size *= 2) { - double gbps = bench(access_size); - max_bandwidth = std::max(gbps, max_bandwidth); - min_bandwidth = std::min(gbps, min_bandwidth); - } - - std::cout << "Max" << memtype << "Bandwidth (GB/s)," << max_bandwidth - << std::endl; - std::cout << "Min" << memtype << "Bandwidth (GB/s)," << min_bandwidth - << std::endl; - } - - public: - void buf_bandwidth() { - if (!_enabled("buffer_bandwidth")) { - std::cout << "Skipped Memory Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ Memory Bandwidth ------" << std::endl; - // Maximum memory space read - 128MB - // For regular devices, bandwidth plateaus at less memory than this, so more - // is not needed. - const uint32_t RANGE = _get_config("buffer_bandwidth", "range"); - _bandwidth("Buffer", RANGE); - } - - void ubo_bandwidth() { - if (!_enabled("ubo_bandwidth")) { - std::cout << "Skipped UBO Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ UBO Bandwidth ------" << std::endl; - const uint32_t RANGE = _get_config("ubo_bandwidth", "range"); - _bandwidth("UBO", RANGE); - } - - void shared_mem_bandwidth() { - if (!_enabled("shared_mem_bandwidth")) { - std::cout << "Skipped Shared Memory Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ Shared Bandwidth ------" << std::endl; - const uint32_t RANGE = max_shared_mem_size_; - _bandwidth("Shared", RANGE); - } - - void tex_bandwidth() { - if (!_enabled("tex_bandwidth")) { - std::cout << "Skipped Texture Bandwidth" << std::endl; - return; - } - - for (int dim = 0; dim < 3; dim++) { - std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------" - << std::endl; - const uint32_t MAX_SIZE = dim == 0 ? max_tex_width_ - : dim == 1 ? max_tex_height_ - : max_tex_depth_; - - // rgba, float - const uint32_t VEC_WIDTH = 4; - const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); - const uint32_t NVEC = MAX_SIZE; - - const uint32_t RANGE = NVEC * VEC_SIZE; - - // Cache lines flushed - const uint32_t NFLUSH = _get_config("tex_bandwidth", "nflush"); - // Number of loop unrolls. 
Changing this value requires an equal change in - // tex_bandwidth.yaml - const uint32_t NUNROLL = _get_config("tex_bandwidth", "nunroll"); - // Number of iterations. Increasing this value reduces noise in exchange - // for higher latency. - const uint32_t NITER = _get_config("tex_bandwidth", "niter"); - // Number of memory reads per thread - const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; - // Number of threads needed to read all texells - const uint32_t NTHREAD = NVEC; - // Occupy all threads - const uint32_t local_x = nthread_logic_; - // Ensure that global is a multiple of local, and distribute across all - // SMs - const uint32_t global_x = - (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH; - - auto shader_name = "tex_bandwidth_" + std::to_string(dim); - - std::vector sizes_whd = {MAX_SIZE, 1, 1}; - if (dim == 1) { - sizes_whd = {1, MAX_SIZE, 1}; - } else if (dim == 2) { - sizes_whd = {1, 1, MAX_SIZE}; - } - auto sizes_nchw = _whd_to_nchw(sizes_whd); - - vTensor in_tensor = - api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - - auto bench = [&](uint32_t access_size, uint32_t dim) { - // Number of texels that fit in this iteration - const uint32_t ntexel_access = access_size / VEC_SIZE; - - StorageBuffer out_buf( - context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {global_x, 1, 1}, - {local_x, 1, 1}, - {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)}, - VK_NULL_HANDLE, - 0, - in_tensor.image(), - out_buf.buffer()); - }); - - const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; - double gbps = SIZE_TRANS * 1e-3 / time; - std::cout << "Texture bandwidth accessing \t" << access_size - << "\tB unique data is \t" << gbps << " \tgbps (\t" << time - << "\tus)" << std::endl; - return gbps; - }; - - double max_bandwidth = 0; - double min_bandwidth = DBL_MAX; - for (uint32_t access_size = VEC_SIZE; access_size < RANGE; - access_size *= 2) { - double gbps = bench(access_size, dim); - max_bandwidth = std::max(gbps, max_bandwidth); - min_bandwidth = std::min(gbps, min_bandwidth); - } - - std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth - << std::endl; - std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth - << std::endl; - } - } - - // Warp size is a difficult metric to obtain because the hardware limitations - // do not always coincide with the way the SM divides the workload. For - // instance, the hardware can have a warp size of 64 threads, but an SM might - // be able to simulate concurrency of 128 threads with a single scheduler. - - // Because of this, it is important to measure the warp size different ways, - // that can evidence both the physical limitations of the hardware, and the - // actual behavior of the driver. - - // Additionally,the SM can behave in two different ways when the assigned - // workload is smaller than the warp size. - - // In Case 1, like ARM Mali, the SM can assign dummy workloads to fill empty - // threads and maintain a uniform workload. - - // In Case 2, like in Adreno, the driver might decide to pack multiple works - // together and dispatch them at once. 
- void warp_size(bool verbose = false) { - if (!_enabled("warp_size")) { - std::cout << "Skipped Warp Size" << std::endl; - return; - } - - std::cout << "\n------ Warp Size ------" << std::endl; - - // Method A: Stress test with a kernel that uses complex ALU operations like - // integer division to avoid latency hiding. Increase the number of threads - // until a jump in latency is detected. - - // This timing-based method helps us identify physical warp sizes. It also - // helps with Case 2, when threads of multiple warps are managed by the same - // scheduler at the same time. - const double COMPENSATE = _get_config("warp_size", "compensate"); - const double THRESHOLD = _get_config("warp_size", "threshold"); - - uint32_t NITER; - - auto bench = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "warp_size_physical"; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - // Large number of work groups selected to potentially saturate all - // ALUs and thus have a better baseline for comparison. - {nthread, 1024, 1}, - {nthread, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - out_buf.buffer()); - }); - - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - uint32_t warp_size = subgroup_size_; - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - - // We increase the number of threads until we hit a jump in the data. - uint32_t nthread = 1; - for (; nthread <= nthread_logic_; ++nthread) { - double time = bench(nthread); - std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)" - << std::endl; - if (dj.push(time)) { - warp_size = nthread - 1; - break; - } - } - if (nthread >= nthread_logic_) { - std::cout - << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size" - << std::endl; - } - - // Method B: Let all the threads in a warp race and atomically fetch-add - // a counter, then store the counter values to the output buffer in the - // scheduling order of these threads. If all the order numbers follow an - // ascending order, then the threads are likely executing within a warp. - // Threads in different warps are not managed by the same scheduler, so they - // would race for a same ID out of order, unaware of each other. - - // This method evidences the actual driver behavior when running - // concurrency, regardless of the physical limitations of the hardware. - - // Likewise, this method helps us identify warp sizes when the SM - // sub-divides its ALUs into independent groups, like the three execution - // engines in a Mali G76 core. It helps warp-probing in Case 1 because it - // doesn't depend on kernel timing, so the extra wait time doesn't lead to - // inaccuracy. 
- auto bench_sm = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "warp_size_scheduler"; - - benchmark_on_gpu(shader_name, 1, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread, 1, 1}, - {nthread, 1, 1}, - {}, - VK_NULL_HANDLE, - 0, - out_buf.buffer()); - }); - - std::vector data(nthread_logic_); - copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes()); - - if (verbose) { - std::stringstream ss; - for (auto j = 0; j < nthread; ++j) { - ss << data[j] << " "; - } - std::cout << ss.str() << std::endl; - } - - // Check until which point is the data in ascending order. - int32_t last = -1; - int32_t j = 0; - for (; j < nthread; ++j) { - if (last >= data[j]) { - break; - } - last = data[j]; - } - - return j; - }; - - // Test increasing sizes until the data is no longer in ascending order. - uint32_t warp_size_scheduler = warp_size; - int i = 1; - for (; i <= nthread_logic_; ++i) { - uint32_t nascend = bench_sm(i); - if (nascend != i) { - warp_size_scheduler = nascend; - break; - } - } - if (i > nthread_logic_) { - std::cout << "Unable to conclude an SM Warp Size." << std::endl; - } - - std::cout << "PhysicalWarpSize," << warp_size << std::endl; - std::cout << "SMWarpSize," << warp_size_scheduler << std::endl; - } -}; - -int main(int argc, const char** argv) { - App app; - - std::string file_path = "config.json"; - if (argc > 1) { - file_path = argv[1]; - }; - app.load_config(file_path); - - app.reg_count(); - app.buf_cacheline_size(); - app.buf_bandwidth(); - app.ubo_bandwidth(); - app.shared_mem_bandwidth(); - app.warp_size(); - app.tex_bandwidth(); - app.tex_cacheline_concurr(); - - return 0; -} diff --git a/backends/vulkan/tools/gpuinfo/src/main.cpp b/backends/vulkan/tools/gpuinfo/src/main.cpp new file mode 100644 index 0000000000..f0e29aaf1a --- /dev/null +++ b/backends/vulkan/tools/gpuinfo/src/main.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "app.h" +#include "architecture.h" +#include "buffers.h" +#include "textures.h" + +using namespace vkapi; + +int main(int argc, const char** argv) { + gpuinfo::App app; + + std::string file_path = "config.json"; + if (argc > 1) { + file_path = argv[1]; + }; + app.load_config(file_path); + + // Architecture + gpuinfo::reg_count(app); + gpuinfo::warp_size(app); + + // Buffers + gpuinfo::buf_cacheline_size(app); + gpuinfo::buf_bandwidth(app); + gpuinfo::ubo_bandwidth(app); + gpuinfo::shared_mem_bandwidth(app); + + // Textures + gpuinfo::tex_bandwidth(app); + gpuinfo::tex_cacheline_concurr(app); + + return 0; +}
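Note on the shared probing pattern: every test moved in this diff (reg_count, buf_cacheline_size, warp_size, tex_cacheline_concurr, the bandwidth probes) follows the same structure -- benchmark a shader while sweeping one parameter and stop at the first latency jump reported by DtJumpFinder, then derive the hardware limit from the step at which the jump occurred. The standalone sketch below illustrates that pattern with a synthetic benchmark and a simple windowed-average detector standing in for DtJumpFinder (whose real implementation lives in stats.h and is not shown in this patch). The detector, its parameters, and fake_bench are illustrative assumptions, not the actual code.

// Standalone illustration of the latency-jump probing pattern used by the
// gpuinfo tests. A synthetic latency function stands in for the GPU, and a
// simple windowed-average detector stands in for DtJumpFinder.
#include <cstdint>
#include <deque>
#include <iostream>
#include <numeric>

// Hypothetical detector: reports a jump when a new sample exceeds the average
// of the last `window` samples by more than `threshold` (fractional), after
// subtracting a fixed `compensate` term for noise.
class SimpleJumpFinder {
 public:
  SimpleJumpFinder(double compensate, double threshold, size_t window = 5)
      : compensate_(compensate), threshold_(threshold), window_(window) {}

  // Returns true if `sample` is a jump relative to the recent history.
  bool push(double sample) {
    if (history_.size() == window_) {
      const double avg =
          std::accumulate(history_.begin(), history_.end(), 0.0) / window_;
      if (sample - compensate_ > avg * (1.0 + threshold_)) {
        return true;
      }
      history_.pop_front();
    }
    history_.push_back(sample);
    return false;
  }

 private:
  double compensate_;
  double threshold_;
  size_t window_;
  std::deque<double> history_;
};

// Synthetic "benchmark": latency stays flat until the probed stride crosses a
// hidden hardware limit (e.g. a cacheline boundary), then steps up.
double fake_bench(uint32_t stride) {
  return stride <= 16 ? 10.0 : 14.0; // microseconds
}

int main() {
  SimpleJumpFinder dj(/*compensate=*/0.01, /*threshold=*/0.1);
  for (uint32_t stride = 1; stride <= 64; ++stride) {
    const double time = fake_bench(stride);
    if (dj.push(time)) {
      // The previous stride was the last one that still fit; report it,
      // mirroring how the real probes back off by one step before printing.
      std::cout << "Jump at stride " << stride
                << "; estimated limit = " << (stride - 1) * sizeof(float)
                << " bytes" << std::endl;
      return 0;
    }
  }
  std::cout << "No jump detected" << std::endl;
  return 0;
}

In the real probes, fake_bench corresponds to a lambda that dispatches a compute shader through context()->submit_compute_job and times it with benchmark_on_gpu, and the reported figure (cacheline size, maximal register count, physical warp size, optimal texture concurrency) is computed from the last parameter value before the jump.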