diff --git a/backends/vulkan/tools/gpuinfo/include/app.h b/backends/vulkan/tools/gpuinfo/include/app.h
new file mode 100644
index 00000000000..21e3258280d
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/include/app.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <folly/json.h>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+
+#include "utils.h"
+
+namespace gpuinfo {
+
+class App {
+ private:
+  folly::dynamic config_;
+
+ public:
+  size_t buf_cache_size;
+  uint32_t max_shared_mem_size;
+  uint32_t sm_count;
+  uint32_t nthread_logic;
+  uint32_t subgroup_size;
+  uint32_t max_tex_width;
+  uint32_t max_tex_height;
+  uint32_t max_tex_depth;
+
+  App() {
+    context()->initialize_querypool();
+
+    std::cout << context()->adapter_ptr()->stringize() << std::endl
+              << std::endl;
+
+    auto cl_device = get_cl_device();
+
+    sm_count = cl_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
+    nthread_logic = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
+    buf_cache_size = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();
+    max_shared_mem_size = cl_device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
+    max_tex_width = cl_device.getInfo<CL_DEVICE_IMAGE3D_MAX_WIDTH>();
+    max_tex_height = cl_device.getInfo<CL_DEVICE_IMAGE3D_MAX_HEIGHT>();
+    max_tex_depth = cl_device.getInfo<CL_DEVICE_IMAGE3D_MAX_DEPTH>();
+
+    VkPhysicalDeviceSubgroupProperties subgroup_props{};
+    VkPhysicalDeviceProperties2 props2{};
+
+    props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
+    props2.pNext = &subgroup_props;
+    subgroup_props.sType =
+        VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
+    vkGetPhysicalDeviceProperties2(
+        context()->adapter_ptr()->physical_handle(), &props2);
+    subgroup_size = subgroup_props.subgroupSize;
+
+    std::cout << std::endl;
+    std::cout << "SM count," << sm_count << std::endl;
+    std::cout << "Logic Thread Count," << nthread_logic << std::endl;
+    std::cout << "Cache Size," << buf_cache_size << std::endl;
+    std::cout << "Shared Memory Size," << max_shared_mem_size << std::endl;
+    std::cout << "SubGroup Size," << subgroup_size << std::endl;
+    std::cout << "MaxTexWidth," << max_tex_width << std::endl;
+    std::cout << "MaxTexHeight," << max_tex_height << std::endl;
+    std::cout << "MaxTexDepth," << max_tex_depth << std::endl;
+  }
+
+  float get_config(const std::string& test, const std::string& key) {
+    if (config_[test].empty()) {
+      throw std::runtime_error("Missing config for " + test);
+    }
+
+    if (!config_[test][key].isNumber()) {
+      throw std::runtime_error(
+          "Config for " + test + "." + key + " is not a number");
+    }
+
+    float value;
+    if (config_[test][key].isDouble()) {
+      value = config_[test][key].getDouble();
+    } else {
+      value = config_[test][key].getInt();
+    }
+
+    std::cout << "Read value for " << test << "." << key << " = " << value
+              << std::endl;
+    return value;
+  }
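+
+  // For reference, the config.json consumed by get_config/enabled is shaped
+  // roughly as below. This is a hypothetical example inferred from the
+  // accessors in this class, not a file shipped with this change; only the
+  // keys each test actually reads are required:
+  //
+  //   {
+  //     "reg_count": {"enabled": true, "compensate": 0.01, "threshold": 3},
+  //     "warp_size": {"enabled": false, "compensate": 0.01, "threshold": 3}
+  //   }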
+
+  bool enabled(const std::string& test) {
+    if (config_.empty() || config_[test].empty() ||
+        !config_[test]["enabled"].isBool()) {
+      return true;
+    }
+    return config_[test]["enabled"].getBool();
+  }
+
+  void load_config(std::string file_path) {
+    std::ifstream file(file_path);
+    std::stringstream buffer;
+    buffer << file.rdbuf();
+    const std::string json_str = buffer.str();
+    if (json_str.empty()) {
+      throw std::runtime_error(
+          "Failed to read config file from " + file_path + ".");
+    }
+    config_ = folly::parseJson(json_str);
+  }
+};
+} // namespace gpuinfo
diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h
new file mode 100644
index 00000000000..63f0786a805
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/include/architecture.h
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <iostream>
+
+#include "app.h"
+#include "stats.h"
+#include "utils.h"
+
+using namespace vkapi;
+
+namespace gpuinfo {
+
+void reg_count(App& app) {
+  if (!app.enabled("reg_count")) {
+    std::cout << "Skipped Register Count" << std::endl;
+    return;
+  }
+
+  std::cout << std::endl;
+  std::cout << "------ Register Count ------" << std::endl;
+  const uint32_t NREG_MIN = 1;
+  const uint32_t NREG_MAX = 512;
+  const uint32_t NREG_STEP = 1;
+
+  const double COMPENSATE = app.get_config("reg_count", "compensate");
+  const double THRESHOLD = app.get_config("reg_count", "threshold");
+
+  const uint32_t NGRP_MIN = 1;
+  const uint32_t NGRP_MAX = 64;
+  const uint32_t NGRP_STEP = 1;
+
+  uint32_t NITER;
+
+  auto bench = [&](uint32_t ngrp, uint32_t nreg) {
+    StorageBuffer buffer(context(), vkapi::kFloat, 1);
+    vkapi::PipelineBarrier pipeline_barrier{};
+
+    auto shader_name = "reg_count_" + std::to_string(nreg);
+
+    auto time = benchmark_on_gpu(shader_name, 100, [&]() {
+      context()->submit_compute_job(
+          VK_KERNEL_FROM_STR(shader_name),
+          pipeline_barrier,
+          {1, ngrp, 1},
+          {1, 1, 1},
+          {SV(NITER)},
+          VK_NULL_HANDLE,
+          0,
+          buffer.buffer());
+    });
+    return time;
+  };
+
+  std::cout << "Calculating NITER..." << std::endl;
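+  // ensure_min_niter comes from utils.h, which this diff leaves mostly
+  // unchanged. Judging by the call pattern here, it re-runs the lambda and
+  // grows NITER until one invocation takes at least the requested number of
+  // microseconds (1000 us), so that timer resolution and launch overhead do
+  // not dominate the measurement. A plausible sketch of that contract, not
+  // the actual implementation:
+  //
+  //   void ensure_min_niter(double min_us, uint32_t& niter, F&& bench) {
+  //     niter = 1;
+  //     while (bench() < min_us) {
+  //       niter *= 2;  // assumed growth strategy
+  //     }
+  //   }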
+  ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); });
+  std::cout << "NITER," << NITER << std::endl;
+
+  uint32_t nreg_max;
+
+  DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+  uint32_t nreg = NREG_MIN;
+  for (; nreg <= NREG_MAX; nreg += NREG_STEP) {
+    double time = bench(1, nreg);
+    std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << std::endl;
+    if (dj.push(time)) {
+      nreg -= NREG_STEP;
+      nreg_max = nreg;
+      break;
+    }
+  }
+  if (nreg >= NREG_MAX) {
+    std::cout << "Unable to conclude a maximal register count" << std::endl;
+    nreg_max = NREG_STEP;
+  } else {
+    std::cout << nreg_max << " registers are available at most" << std::endl;
+  }
+
+  auto find_ngrp_by_nreg = [&](const uint32_t nreg) {
+    DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+    for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) {
+      auto time = bench(ngrp, nreg);
+      std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp
+                << ", time=" << time << " us" << std::endl;
+
+      if (dj.push(time)) {
+        ngrp -= NGRP_STEP;
+        std::cout << "Using " << nreg << " registers can have " << ngrp
+                  << " concurrent single-thread workgroups" << std::endl;
+        return ngrp;
+      }
+    }
+    std::cout
+        << "Unable to conclude a maximum number of concurrent single-thread workgroups when "
+        << nreg << " registers are occupied" << std::endl;
+    return (uint32_t)1;
+  };
+
+  uint32_t ngrp_full, ngrp_half;
+  ngrp_full = find_ngrp_by_nreg(nreg_max);
+  ngrp_half = find_ngrp_by_nreg(nreg_max / 2);
+
+  std::string reg_ty;
+
+  if (ngrp_full * 1.5 < ngrp_half) {
+    std::cout << "All physical threads in an SM share " << nreg_max
+              << " registers" << std::endl;
+    reg_ty = "Pooled";
+  } else {
+    std::cout << "Each physical thread has " << nreg_max << " registers"
+              << std::endl;
+    reg_ty = "Dedicated";
+  }
+
+  std::cout << std::endl << std::endl;
+  std::cout << "NITER," << NITER << std::endl;
+  std::cout << "Max registers," << nreg_max << std::endl;
+  std::cout << "Concurrent full single thread workgroups," << ngrp_full
+            << std::endl;
+  std::cout << "Concurrent half single thread workgroups," << ngrp_half
+            << std::endl;
+  std::cout << "Register type," << reg_ty << std::endl;
+}
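+
+// DtJumpFinder comes from stats.h, which is not part of this diff. Its
+// visible interface is DtJumpFinder<N>(compensate, threshold) plus
+// push(time), which returns true once `time` jumps above the recent trend
+// by more than the configured margins. A rough sketch of the assumed
+// semantics, not the actual implementation:
+//
+//   template <size_t N>  // window of the last N samples
+//   class DtJumpFinder {
+//    public:
+//     DtJumpFinder(double compensate, double threshold);
+//     // True when `time` exceeds the windowed average by roughly
+//     // avg * threshold + compensate; callers then step back one unit.
+//     bool push(double time);
+//   };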
+
+// Warp size is a difficult metric to obtain because the hardware limitations
+// do not always coincide with the way the SM divides the workload. For
+// instance, the hardware can have a warp size of 64 threads, but an SM might
+// be able to simulate concurrency of 128 threads with a single scheduler.
+//
+// Because of this, it is important to measure the warp size in different
+// ways that can evidence both the physical limitations of the hardware and
+// the actual behavior of the driver.
+//
+// Additionally, the SM can behave in two different ways when the assigned
+// workload is smaller than the warp size.
+//
+// In Case 1, like ARM Mali, the SM can assign dummy workloads to fill empty
+// threads and maintain a uniform workload.
+//
+// In Case 2, like Adreno, the driver might decide to pack multiple
+// workloads together and dispatch them at once.
+void warp_size(App& app, bool verbose = false) {
+  if (!app.enabled("warp_size")) {
+    std::cout << "Skipped Warp Size" << std::endl;
+    return;
+  }
+
+  std::cout << "\n------ Warp Size ------" << std::endl;
+
+  // Method A: Stress test with a kernel that uses complex ALU operations
+  // like integer division to avoid latency hiding. Increase the number of
+  // threads until a jump in latency is detected.
+  //
+  // This timing-based method helps us identify physical warp sizes. It also
+  // helps with Case 2, when threads of multiple warps are managed by the
+  // same scheduler at the same time.
+  const double COMPENSATE = app.get_config("warp_size", "compensate");
+  const double THRESHOLD = app.get_config("warp_size", "threshold");
+
+  uint32_t NITER;
+
+  auto bench = [&](uint32_t nthread) {
+    StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic);
+    vkapi::PipelineBarrier pipeline_barrier{};
+
+    auto shader_name = "warp_size_physical";
+
+    auto time = benchmark_on_gpu(shader_name, 10, [&]() {
+      context()->submit_compute_job(
+          VK_KERNEL_FROM_STR(shader_name),
+          pipeline_barrier,
+          // Large number of work groups selected to potentially saturate all
+          // ALUs and thus have a better baseline for comparison.
+          {nthread, 1024, 1},
+          {nthread, 1, 1},
+          {SV(NITER)},
+          VK_NULL_HANDLE,
+          0,
+          out_buf.buffer());
+    });
+
+    return time;
+  };
+
+  ensure_min_niter(1000, NITER, [&]() { return bench(1); });
+
+  uint32_t warp_size = app.subgroup_size;
+  DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+
+  // We increase the number of threads until we hit a jump in the data.
+  uint32_t nthread = 1;
+  for (; nthread <= app.nthread_logic; ++nthread) {
+    double time = bench(nthread);
+    std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)"
+              << std::endl;
+    if (dj.push(time)) {
+      warp_size = nthread - 1;
+      break;
+    }
+  }
+  if (nthread >= app.nthread_logic) {
+    std::cout
+        << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size"
+        << std::endl;
+  }
+
+  // Method B: Let all the threads in a warp race and atomically fetch-add
+  // a counter, then store the counter values to the output buffer in the
+  // scheduling order of these threads. If all the order numbers follow an
+  // ascending order, then the threads are likely executing within a warp.
+  // Threads in different warps are not managed by the same scheduler, so
+  // they would race for the same ID out of order, unaware of each other.
+  //
+  // This method evidences the actual driver behavior when running
+  // concurrent workloads, regardless of the physical limitations of the
+  // hardware.
+  //
+  // Likewise, this method helps us identify warp sizes when the SM
+  // sub-divides its ALUs into independent groups, like the three execution
+  // engines in a Mali G76 core. It helps warp-probing in Case 1 because it
+  // doesn't depend on kernel timing, so the extra wait time doesn't lead to
+  // inaccuracy.
+  auto bench_sm = [&](uint32_t nthread) {
+    StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic);
+    vkapi::PipelineBarrier pipeline_barrier{};
+
+    auto shader_name = "warp_size_scheduler";
+
+    benchmark_on_gpu(shader_name, 1, [&]() {
+      context()->submit_compute_job(
+          VK_KERNEL_FROM_STR(shader_name),
+          pipeline_barrier,
+          {nthread, 1, 1},
+          {nthread, 1, 1},
+          {},
+          VK_NULL_HANDLE,
+          0,
+          out_buf.buffer());
+    });
+
+    std::vector<int32_t> data(app.nthread_logic);
+    copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes());
+
+    if (verbose) {
+      std::stringstream ss;
+      for (auto j = 0; j < nthread; ++j) {
+        ss << data[j] << " ";
+      }
+      std::cout << ss.str() << std::endl;
+    }
+
+    // Check up to which point the data is in ascending order.
+    int32_t last = -1;
+    int32_t j = 0;
+    for (; j < nthread; ++j) {
+      if (last >= data[j]) {
+        break;
+      }
+      last = data[j];
+    }
+
+    return j;
+  };
+
+  // Test increasing sizes until the data is no longer in ascending order.
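+  //
+  // Worked example (hypothetical numbers): if bench_sm(8) reads back
+  //   data = 0 1 2 3 8 9 10 11
+  // the ascending run breaks after four entries, so nascend = 4 != 8: two
+  // schedulers interleaved their fetch-adds, and the loop below would
+  // conclude warp_size_scheduler = 4.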
+  uint32_t warp_size_scheduler = warp_size;
+  int i = 1;
+  for (; i <= app.nthread_logic; ++i) {
+    uint32_t nascend = bench_sm(i);
+    if (nascend != i) {
+      warp_size_scheduler = nascend;
+      break;
+    }
+  }
+  if (i > app.nthread_logic) {
+    std::cout << "Unable to conclude an SM Warp Size." << std::endl;
+  }
+
+  std::cout << "PhysicalWarpSize," << warp_size << std::endl;
+  std::cout << "SMWarpSize," << warp_size_scheduler << std::endl;
+}
+} // namespace gpuinfo
diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h
new file mode 100644
index 00000000000..7f108a3e13d
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/include/buffers.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cctype>
+#include <cfloat>
+
+#include "app.h"
+#include "stats.h"
+#include "utils.h"
+
+using namespace vkapi;
+
+namespace gpuinfo {
+
+void buf_cacheline_size(App& app) {
+  if (!app.enabled("buf_cacheline_size")) {
+    std::cout << "Skipped Buffer Cacheline Size" << std::endl;
+    return;
+  }
+
+  std::cout << std::endl;
+  std::cout << "------ Buffer Cacheline Size ------" << std::endl;
+
+  const double COMPENSATE = app.get_config("buf_cacheline_size", "compensate");
+  const double THRESHOLD = app.get_config("buf_cacheline_size", "threshold");
+
+  const uint32_t PITCH = app.buf_cache_size / app.nthread_logic;
+  const uint32_t BUF_SIZE = app.buf_cache_size;
+  const uint32_t MAX_STRIDE = PITCH;
+
+  uint32_t NITER;
+
+  auto bench = [&](int stride) {
+    StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE);
+    StorageBuffer out_buf(context(), vkapi::kFloat, 1);
+    vkapi::PipelineBarrier pipeline_barrier{};
+
+    auto shader_name = "buf_cacheline_size";
+
+    auto time = benchmark_on_gpu(shader_name, 100, [&]() {
+      context()->submit_compute_job(
+          VK_KERNEL_FROM_STR(shader_name),
+          pipeline_barrier,
+          {app.nthread_logic, 1, 1},
+          {app.nthread_logic, 1, 1},
+          {SV(NITER), SV(stride), SV(PITCH)},
+          VK_NULL_HANDLE,
+          0,
+          in_buf.buffer(),
+          out_buf.buffer());
+    });
+    return time;
+  };
+
+  ensure_min_niter(1000, NITER, [&]() { return bench(1); });
+
+  uint32_t cacheline_size;
+
+  DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+  uint32_t stride = 1;
+  for (; stride <= MAX_STRIDE; ++stride) {
+    double time = bench(stride);
+    std::cout << "Testing stride=\t" << stride << "\t, time=\t" << time
+              << std::endl;
+
+    if (dj.push(time)) {
+      cacheline_size = stride * sizeof(float);
+      break;
+    }
+  }
+  if (stride >= MAX_STRIDE) {
+    std::cout << "Unable to conclude a top level buffer cacheline size."
+              << std::endl;
+    cacheline_size = MAX_STRIDE * sizeof(float);
+  }
+
+  std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl;
+}
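+
+// Worked example of the probe above (hypothetical numbers): with
+// buf_cache_size = 32768 and nthread_logic = 512, PITCH = MAX_STRIDE = 64.
+// If the latency jump first appears at stride = 16, the probe reports
+// 16 * sizeof(float) = 64 B as the top-level cacheline size.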
+
+void _bandwidth(App& app, std::string memtype, uint32_t range) {
+  auto memtype_lower = memtype;
+  std::transform(
+      memtype_lower.begin(),
+      memtype_lower.end(),
+      memtype_lower.begin(),
+      [](unsigned char c) { return std::tolower(c); });
+
+  auto test_name = memtype_lower + "_bandwidth";
+
+  // Cache lines flushed
+  const uint32_t NFLUSH = app.get_config(test_name, "nflush");
+  // Number of loop unrolls. Changing this value requires an equal change in
+  // buf_bandwidth.yaml
+  const uint32_t NUNROLL = app.get_config(test_name, "nunroll");
+  // Number of iterations. Increasing this value reduces noise in exchange
+  // for higher latency.
+  const uint32_t NITER = app.get_config(test_name, "niter");
+  // Vector dimensions (vec4)
+  const uint32_t VEC_WIDTH = 4;
+  const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
+  // Number of vectors that fit in the selected memory space
+  const uint32_t NVEC = range / VEC_SIZE;
+  // Number of memory reads per thread
+  const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
+  // Number of threads needed to read all vectors. For shared memory the
+  // thread count is not divided by the per-thread workload, because the
+  // memory space is too small to give every thread a distinct range.
+  const uint32_t NTHREAD = memtype == "Shared" ? NVEC : NVEC / NREAD_PER_THREAD;
+  // Occupy all threads
+  const uint32_t local_x = app.nthread_logic;
+  // Ensure that global is a multiple of local, and distribute across all SMs
+  const uint32_t global_x =
+      (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH;
+
+  auto bench = [&](uint32_t access_size) {
+    // Number of vectors that fit in this iteration
+    const uint32_t nvec_access = access_size / VEC_SIZE;
+
+    StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float));
+    StorageBuffer out_buf(
+        context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic);
+    vkapi::PipelineBarrier pipeline_barrier{};
+
+    auto shader_name = "buf_bandwidth_" + memtype_lower;
+
+    auto time = benchmark_on_gpu(shader_name, 10, [&]() {
+      context()->submit_compute_job(
+          VK_KERNEL_FROM_STR(shader_name),
+          pipeline_barrier,
+          {global_x, 1, 1},
+          {local_x, 1, 1},
+          {SV(NITER), SV(nvec_access), SV(local_x)},
+          VK_NULL_HANDLE,
+          0,
+          in_buf.buffer(),
+          out_buf.buffer());
+    });
+
+    const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
+    auto gbps = SIZE_TRANS * 1e-3 / time;
+    std::cout << memtype << " bandwidth accessing \t" << access_size
+              << "\tB unique data is \t" << gbps << " \tgbps (\t" << time
+              << "\tus)" << std::endl;
+    return gbps;
+  };
+
+  double max_bandwidth = 0;
+  double min_bandwidth = DBL_MAX;
+  for (uint32_t access_size = VEC_SIZE; access_size < range;
+       access_size *= 2) {
+    double gbps = bench(access_size);
+    max_bandwidth = std::max(gbps, max_bandwidth);
+    min_bandwidth = std::min(gbps, min_bandwidth);
+  }
+
+  std::cout << "Max" << memtype << "Bandwidth (GB/s)," << max_bandwidth
+            << std::endl;
+  std::cout << "Min" << memtype << "Bandwidth (GB/s)," << min_bandwidth
+            << std::endl;
+}
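+
+// Worked example of the dispatch math above (hypothetical numbers): with
+// range = 128 MB, NVEC = 128 MB / 16 B = 8,388,608 vec4s. NUNROLL = 16 and
+// NITER = 4 give NREAD_PER_THREAD = 64, so NTHREAD = 131,072. With
+// local_x = 512, sm_count = 16 and NFLUSH = 4, global_x becomes
+// 131,072 * 16 * 4 = 8,388,608 invocations. The GB/s figure follows from
+// SIZE_TRANS bytes over `time` microseconds: bytes / us * 1e-3 = GB/s.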
+
+void buf_bandwidth(App& app) {
+  if (!app.enabled("buffer_bandwidth")) {
+    std::cout << "Skipped Memory Bandwidth" << std::endl;
+    return;
+  }
+
+  std::cout << "\n------ Memory Bandwidth ------" << std::endl;
+  // Maximum memory space read: 128 MB. For regular devices, bandwidth
+  // plateaus with less memory than this, so more is not needed.
+  const uint32_t RANGE = app.get_config("buffer_bandwidth", "range");
+  _bandwidth(app, "Buffer", RANGE);
+}
+
+void ubo_bandwidth(App& app) {
+  if (!app.enabled("ubo_bandwidth")) {
+    std::cout << "Skipped UBO Bandwidth" << std::endl;
+    return;
+  }
+
+  std::cout << "\n------ UBO Bandwidth ------" << std::endl;
+  const uint32_t RANGE = app.get_config("ubo_bandwidth", "range");
+  _bandwidth(app, "UBO", RANGE);
+}
+
+void shared_mem_bandwidth(App& app) {
+  if (!app.enabled("shared_mem_bandwidth")) {
+    std::cout << "Skipped Shared Memory Bandwidth" << std::endl;
+    return;
+  }
+
+  std::cout << "\n------ Shared Bandwidth ------" << std::endl;
+  const uint32_t RANGE = app.max_shared_mem_size;
+  _bandwidth(app, "Shared", RANGE);
+}
+} // namespace gpuinfo
diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h
new file mode 100644
index 00000000000..f8874306c0a
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/include/textures.h
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cfloat>
+
+#include "app.h"
+#include "stats.h"
+#include "utils.h"
+
+namespace gpuinfo {
+
+// To improve render quality, textures are usually filtered and interpolated
+// to smooth out color steps in the original image, which requires multiple
+// accesses to a 3D area near the point of interpolation. Textures are
+// therefore drastically different from buffers: they need to provide rapid
+// access to data points in a 3D patch, and each data point has four
+// elements, as textures were originally designed for graphics, corresponding
+// to the components of the RGB color space plus an alpha channel. Also,
+// unlike a buffer, which refers to a contiguous range of memory, a texture
+// is an opaque object defined by GPU vendors; users usually have no
+// knowledge of its actual data layout, so it is possible that the memory is
+// *not* laid out linearly. These characteristics prevent us from tailoring
+// the addressing and vectorization of data. The only general optimization we
+// can do is to align the amount of data accessed at a time to the relatively
+// small top-level cache system, i.e., the L1 texture cache.
+//
+// In the experiment, we assume the L1 cacheline is small enough that several
+// threads reading float4s can exceed it. In both directions, along the width
+// and along the height, each logically concurrent thread in an SM reads a
+// float4. Such memory access should be satisfied by a single cache fetch,
+// but if the cache is not large enough to contain all the requested data,
+// multiple fetches will significantly increase access latency.
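+
+// Both probes in this file size their test textures via whd_to_nchw from
+// utils.h. For illustration: a channels-packed extent {W, H, D} =
+// {64, 32, 16} maps to the NCHW size {1, 16 * 4, 32, 64}, i.e. every texel
+// carries four channel values.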
+void tex_cacheline_size(App& app) {
+  if (!app.enabled("tex_cacheline_size")) {
+    std::cout << "Skipped Texture Cacheline Size" << std::endl;
+    return;
+  }
+
+  const double COMPENSATE = app.get_config("tex_cacheline_size", "compensate");
+  const double THRESHOLD = app.get_config("tex_cacheline_size", "threshold");
+
+  uint32_t concur_nthread_by_dim[3];
+
+  for (int dim = 0; dim < 3; ++dim) {
+    std::cout << std::endl;
+    std::cout << "------ Texture Cacheline Size (dim = " << dim << ") ------"
+              << std::endl;
+
+    uint32_t NITER;
+
+    const uint32_t IMG_OTHER_EDGE = dim == 0 ? app.max_tex_width
+                                  : dim == 1 ? app.max_tex_height
+                                             : app.max_tex_depth;
+
+    const uint32_t MAX_NTHREAD = std::min(app.nthread_logic, IMG_OTHER_EDGE);
+
+    uint32_t& concur_nthread = concur_nthread_by_dim[dim];
+
+    auto bench = [&](uint32_t nthread) {
+      std::vector<int64_t> sizes_whd = {
+          app.max_tex_width, app.max_tex_height, app.max_tex_depth};
+
+      auto sizes_nchw = whd_to_nchw(sizes_whd);
+
+      vTensor in_tensor =
+          api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);
+
+      // Single vec4
+      StorageBuffer out_buf(context(), vkapi::kFloat, 4);
+
+      vkapi::PipelineBarrier pipeline_barrier{};
+
+      auto shader_name = "tex_cacheline_size_" + std::to_string(dim);
+
+      auto time = benchmark_on_gpu(shader_name, 100, [&]() {
+        context()->submit_compute_job(
+            VK_KERNEL_FROM_STR(shader_name),
+            pipeline_barrier,
+            {nthread, 1, 1},
+            {nthread, 1, 1},
+            {SV(NITER)},
+            VK_NULL_HANDLE,
+            0,
+            in_tensor.image(),
+            out_buf.buffer());
+      });
+      return time;
+    };
+
+    ensure_min_niter(1000, NITER, [&]() { return bench(1); });
+
+    DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+    uint32_t nthread = 1;
+    for (; nthread <= MAX_NTHREAD; ++nthread) {
+      double time = bench(nthread);
+      std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time
+                << std::endl;
+
+      if (dj.push(time)) {
+        concur_nthread = nthread - 1;
+        std::cout << "Can concurrently access " << concur_nthread
+                  << "px with minimal cost along dim=" << dim << std::endl;
+        break;
+      }
+    }
+    if (nthread >= MAX_NTHREAD) {
+      std::cout << "Unable to conclude a top level texture cacheline size."
+                << std::endl;
+    } else {
+      concur_nthread_by_dim[dim] = concur_nthread;
+    }
+  }
+
+  uint32_t TEXEL_SIZE = 4 * sizeof(float);
+  const uint32_t concur_nthread_x = concur_nthread_by_dim[0];
+  const uint32_t concur_nthread_y = concur_nthread_by_dim[1];
+
+  uint32_t cacheline_size = TEXEL_SIZE *
+      std::max(concur_nthread_x, concur_nthread_y) /
+      std::min(concur_nthread_x, concur_nthread_y);
+
+  std::cout << "TextureCachelineSize," << cacheline_size << std::endl;
+
+  std::string cacheline_dim;
+  cacheline_dim = concur_nthread_x >= concur_nthread_y ? "X" : "Y";
+  std::cout << "TextureCachelineDim," << cacheline_dim << std::endl;
+}
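+
+// Worked example of the computation above (hypothetical numbers): if 16
+// threads can read along X and 4 along Y at minimal cost, the cacheline is
+// estimated as 16 B * 16 / 4 = 64 B, laid out along X (the dimension with
+// more cheap concurrent accesses).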
+
+void tex_bandwidth(App& app) {
+  if (!app.enabled("tex_bandwidth")) {
+    std::cout << "Skipped Texture Bandwidth" << std::endl;
+    return;
+  }
+
+  for (int dim = 0; dim < 3; dim++) {
+    std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------"
+              << std::endl;
+    const uint32_t MAX_SIZE = dim == 0 ? app.max_tex_width
+                            : dim == 1 ? app.max_tex_height
+                                       : app.max_tex_depth;
+
+    // rgba, float
+    const uint32_t VEC_WIDTH = 4;
+    const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
+    const uint32_t NVEC = MAX_SIZE;
+
+    const uint32_t RANGE = NVEC * VEC_SIZE;
+
+    // Cache lines flushed
+    const uint32_t NFLUSH = app.get_config("tex_bandwidth", "nflush");
+    // Number of loop unrolls. Changing this value requires an equal change
+    // in tex_bandwidth.yaml
+    const uint32_t NUNROLL = app.get_config("tex_bandwidth", "nunroll");
+    // Number of iterations. Increasing this value reduces noise in exchange
+    // for higher latency.
+    const uint32_t NITER = app.get_config("tex_bandwidth", "niter");
+    // Number of memory reads per thread
+    const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
+    // Number of threads needed to read all texels
+    const uint32_t NTHREAD = NVEC;
+    // Occupy all threads
+    const uint32_t local_x = app.nthread_logic;
+    // Ensure that global is a multiple of local, and distribute across all
+    // SMs
+    const uint32_t global_x =
+        (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH;
+
+    auto shader_name = "tex_bandwidth_" + std::to_string(dim);
+
+    std::vector<int64_t> sizes_whd = {MAX_SIZE, 1, 1};
+    if (dim == 1) {
+      sizes_whd = {1, MAX_SIZE, 1};
+    } else if (dim == 2) {
+      sizes_whd = {1, 1, MAX_SIZE};
+    }
+    auto sizes_nchw = whd_to_nchw(sizes_whd);
+
+    vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);
+
+    auto bench = [&](uint32_t access_size, uint32_t dim) {
+      // Number of texels that fit in this iteration
+      const uint32_t ntexel_access = access_size / VEC_SIZE;
+
+      StorageBuffer out_buf(
+          context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic);
+      vkapi::PipelineBarrier pipeline_barrier{};
+
+      auto time = benchmark_on_gpu(shader_name, 10, [&]() {
+        context()->submit_compute_job(
+            VK_KERNEL_FROM_STR(shader_name),
+            pipeline_barrier,
+            {global_x, 1, 1},
+            {local_x, 1, 1},
+            {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)},
+            VK_NULL_HANDLE,
+            0,
+            in_tensor.image(),
+            out_buf.buffer());
+      });
+
+      const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
+      double gbps = SIZE_TRANS * 1e-3 / time;
+      std::cout << "Texture bandwidth accessing \t" << access_size
+                << "\tB unique data is \t" << gbps << " \tgbps (\t" << time
+                << "\tus)" << std::endl;
+      return gbps;
+    };
+
+    double max_bandwidth = 0;
+    double min_bandwidth = DBL_MAX;
+    for (uint32_t access_size = VEC_SIZE; access_size < RANGE;
+         access_size *= 2) {
+      double gbps = bench(access_size, dim);
+      max_bandwidth = std::max(gbps, max_bandwidth);
+      min_bandwidth = std::min(gbps, min_bandwidth);
+    }
+
+    std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth
+              << std::endl;
+    std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth
+              << std::endl;
+  }
+}
+} // namespace gpuinfo
diff --git a/backends/vulkan/tools/gpuinfo/include/utils.h b/backends/vulkan/tools/gpuinfo/include/utils.h
index 231fb32c5a9..887cb443ef4 100644
--- a/backends/vulkan/tools/gpuinfo/include/utils.h
+++ b/backends/vulkan/tools/gpuinfo/include/utils.h
@@ -54,6 +54,15 @@ void ensure_min_niter(
   }
 }
 
+std::vector<int64_t> whd_to_nchw(std::vector<int64_t> sizes) {
+  const int64_t W = sizes[0];
+  const int64_t H = sizes[1];
+  const int64_t D = sizes[2];
+
+  // Channels-packed: {W, H, D} = {W, H, (C / 4) * N}
+  return {1, D * 4, H, W};
+}
+
 cl_platform_id get_cl_platform_id() {
   cl_uint nplatform_id;
   clGetPlatformIDs(0, nullptr, &nplatform_id);
diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp deleted file mode 100644 index 42631702f5e..00000000000 --- a/backends/vulkan/tools/gpuinfo/src/app.cpp +++ /dev/null @@ -1,805 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree.
- */ - -#include -#include -#include -#include -#include - -#include "stats.h" -#include "utils.h" - -using namespace vkapi; - -class App { - private: - size_t buf_cache_size_; - uint32_t max_shared_mem_size_; - uint32_t sm_count_; - uint32_t nthread_logic_; - uint32_t subgroup_size_; - uint32_t max_tex_width_; - uint32_t max_tex_height_; - uint32_t max_tex_depth_; - folly::dynamic config_; - - std::vector _whd_to_nchw(std::vector sizes) { - const int64_t W = sizes[0]; - const int64_t H = sizes[1]; - const int64_t D = sizes[2]; - - // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} - return {1, D * 4, H, W}; - } - - float _get_config(const std::string& test, const std::string& key) { - if (config_[test].empty()) { - throw std::runtime_error("Missing config for " + test); - } - - if (!config_[test][key].isNumber()) { - throw std::runtime_error( - "Config for " + test + "." + key + " is not a number"); - } - - float value; - if (config_[test][key].isDouble()) { - value = config_[test][key].getDouble(); - } else { - value = config_[test][key].getInt(); - } - - std::cout << "Read value for " << test << "." << key << " = " << value - << std::endl; - return value; - } - - bool _enabled(const std::string& test) { - if (config_.empty() || config_[test].empty() || - !config_[test]["enabled"].isBool()) { - return true; - } - return config_[test]["enabled"].getBool(); - } - - public: - App() { - context()->initialize_querypool(); - - std::cout << context()->adapter_ptr()->stringize() << std::endl - << std::endl; - - auto cl_device = get_cl_device(); - - sm_count_ = cl_device.getInfo(); - nthread_logic_ = cl_device.getInfo(); - buf_cache_size_ = cl_device.getInfo(); - max_shared_mem_size_ = cl_device.getInfo(); - max_tex_width_ = cl_device.getInfo(); - max_tex_height_ = cl_device.getInfo(); - max_tex_depth_ = cl_device.getInfo(); - - VkPhysicalDeviceSubgroupProperties subgroup_props{}; - VkPhysicalDeviceProperties2 props2{}; - - props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; - props2.pNext = &subgroup_props; - subgroup_props.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; - vkGetPhysicalDeviceProperties2( - context()->adapter_ptr()->physical_handle(), &props2); - subgroup_size_ = subgroup_props.subgroupSize; - - std::cout << std::endl; - std::cout << "SM count," << sm_count_ << std::endl; - std::cout << "Logic Thread Count," << nthread_logic_ << std::endl; - std::cout << "Cache Size," << buf_cache_size_ << std::endl; - std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl; - std::cout << "SubGroup Size," << subgroup_size_ << std::endl; - std::cout << "MaxTexWidth," << max_tex_width_ << std::endl; - std::cout << "MaxTexHeight," << max_tex_height_ << std::endl; - std::cout << "MaxTexDepth," << max_tex_depth_ << std::endl; - } - - void load_config(std::string file_path) { - std::ifstream file(file_path); - std::stringstream buffer; - buffer << file.rdbuf(); - const std::string json_str = buffer.str(); - if (json_str.empty()) { - throw std::runtime_error( - "Failed to read config file from " + file_path + "."); - } - config_ = folly::parseJson(json_str); - } - - void reg_count() { - if (!_enabled("reg_count")) { - std::cout << "Skipped Register Count" << std::endl; - return; - } - - std::cout << std::endl; - std::cout << "------ Register Count ------" << std::endl; - const uint32_t NREG_MIN = 1; - const uint32_t NREG_MAX = 512; - const uint32_t NREG_STEP = 1; - - const double COMPENSATE = _get_config("reg_count", "compensate"); - const double THRESHOLD = 
_get_config("reg_count", "threshold"); - - const uint32_t NGRP_MIN = 1; - const uint32_t NGRP_MAX = 64; - const uint32_t NGRP_STEP = 1; - - uint32_t NITER; - - auto bench = [&](uint32_t ngrp, uint32_t nreg) { - StorageBuffer buffer(context(), vkapi::kFloat, 1); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "reg_count_" + std::to_string(nreg); - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {1, ngrp, 1}, - {1, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - buffer.buffer()); - }); - return time; - }; - - std::cout << "Calculating NITER..." << std::endl; - ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); }); - std::cout << "NITER," << NITER << std::endl; - - uint32_t nreg_max; - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t nreg = NREG_MIN; - for (; nreg <= NREG_MAX; nreg += NREG_STEP) { - double time = bench(1, nreg); - std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time - << std::endl; - if (dj.push(time)) { - nreg -= NREG_STEP; - nreg_max = nreg; - break; - } - } - if (nreg >= NREG_MAX) { - std::cout << "Unable to conclude a maximal register count" << std::endl; - nreg_max = NREG_STEP; - } else { - std::cout << nreg_max << " registers are available at most" << std::endl; - } - - auto find_ngrp_by_nreg = [&](const uint32_t nreg) { - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) { - auto time = bench(ngrp, nreg); - std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp - << ", time=" << time << " us" << std::endl; - - if (dj.push(time)) { - ngrp -= NGRP_STEP; - std::cout << "Using " << nreg << " registers can have " << ngrp - << " concurrent single-thread workgroups" << std::endl; - return ngrp; - } - } - std::cout - << "Unable to conclude a maximum number of concurrent single-thread workgroups when " - << nreg << " registers are occupied" << std::endl; - return (uint32_t)1; - }; - - uint32_t ngrp_full, ngrp_half; - ngrp_full = find_ngrp_by_nreg(nreg_max); - ngrp_half = find_ngrp_by_nreg(nreg_max / 2); - - std::string reg_ty; - - if (ngrp_full * 1.5 < ngrp_half) { - std::cout << "All physical threads in an sm share " << nreg_max - << " registers" << std::endl; - reg_ty = "Pooled"; - - } else { - std::cout << "Each physical thread has " << nreg_max << " registers" - << std::endl; - reg_ty = "Dedicated"; - } - - std::cout << std::endl << std::endl; - std::cout << "NITER," << NITER << std::endl; - std::cout << "Max registers," << nreg_max << std::endl; - std::cout << "Concurrent full single thread workgroups," << ngrp_full - << std::endl; - std::cout << "Concurrent half single thread workgroups," << ngrp_half - << std::endl; - std::cout << "Register type," << reg_ty << std::endl; - } - - void buf_cacheline_size() { - if (!_enabled("buf_cacheline_size")) { - std::cout << "Skipped Buffer Cacheline Size" << std::endl; - return; - } - - std::cout << std::endl; - std::cout << "------ Buffer Cacheline Size ------" << std::endl; - - const double COMPENSATE = _get_config("buf_cacheline_size", "compensate"); - const double THRESHOLD = _get_config("buf_cacheline_size", "threshold"); - - const uint32_t PITCH = buf_cache_size_ / nthread_logic_; - const uint32_t BUF_SIZE = buf_cache_size_; - const uint32_t MAX_STRIDE = PITCH; - - uint32_t NITER; - - auto bench = [&](int stride) { - StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); - StorageBuffer out_buf(context(), 
vkapi::kFloat, 1); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "buf_cacheline_size"; - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread_logic_, 1, 1}, - {nthread_logic_, 1, 1}, - {SV(NITER), SV(stride), SV(PITCH)}, - VK_NULL_HANDLE, - 0, - in_buf.buffer(), - out_buf.buffer()); - }); - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - uint32_t cacheline_size; - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t stride = 1; - for (; stride <= MAX_STRIDE; ++stride) { - double time = bench(stride); - std::cout << "Testing stride=\t" << stride << "\t, time=\t" << time - << std::endl; - - if (dj.push(time)) { - cacheline_size = stride * sizeof(float); - break; - } - } - if (stride >= MAX_STRIDE) { - std::cout << "Unable to conclude a top level buffer cacheline size." - << std::endl; - cacheline_size = MAX_STRIDE * sizeof(float); - } - - std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl; - } - - // Textures are drastically different from buffers in terms of data layout. - // While buffers are a contiguous range of memory, textures are opaque objects - // defined by the vendor and it is possible that nearby points of data are not - // neighboring in memory. Likewise, data points are accessed in - // multi-dimensional patches instead of simple lines. This makes the stride - // method for figuring out the cache line size not applicable. To go around - // this, this experiment runs an increasing amount of threads accessing - // different datapoints in the texture and measures latency. If the cache line - // is big enough for all threads to access it at the same time, latency will - // be low. When there are more threads than what a single cache line can - // handle, a second line must be fetched, increasing latency in a measurable - // way. With this, we can find the cache line size of all three dimensions. - void tex_cacheline_size() { - if (!_enabled("tex_cacheline_size")) { - std::cout << "Skipped Texture Cacheline Size" << std::endl; - return; - } - - const double COMPENSATE = _get_config("tex_cacheline_size", "compensate"); - const double THRESHOLD = _get_config("tex_cacheline_size", "threshold"); - - uint32_t concur_nthread_by_dim[3]; - - for (int dim = 0; dim < 3; ++dim) { - std::cout << std::endl; - std::cout << "------ Texture Cacheline Size (dim = " << dim << ") ------" - << std::endl; - - uint32_t NITER; - - const uint32_t IMG_OTHER_EDGE = dim == 0 ? max_tex_width_ - : dim == 1 ? 
max_tex_height_ - : max_tex_depth_; - - const uint32_t MAX_NTHREAD = std::min(nthread_logic_, IMG_OTHER_EDGE); - - uint32_t& concur_nthread = concur_nthread_by_dim[dim]; - - auto bench = [&](uint32_t nthread) { - std::vector sizes_whd = { - max_tex_width_, max_tex_height_, max_tex_depth_}; - - auto sizes_nchw = _whd_to_nchw(sizes_whd); - - vTensor in_tensor = - api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - - // Single vec4 - StorageBuffer out_buf(context(), vkapi::kFloat, 4); - - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "tex_cacheline_size_" + std::to_string(dim); - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread, 1, 1}, - {nthread, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - in_tensor.image(), - out_buf.buffer()); - }); - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t nthread = 1; - for (; nthread <= MAX_NTHREAD; ++nthread) { - double time = bench(nthread); - std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time - << std::endl; - - if (dj.push(time)) { - concur_nthread = nthread - 1; - std::cout << "Can concurrently access " << concur_nthread - << "px with " << "minimal cost along dim=" << dim - << std::endl; - break; - } - } - if (nthread >= MAX_NTHREAD) { - std::cout - << "Unable to conclude a top level texture cacheline size for dim " - << dim << std::endl; - } else { - concur_nthread_by_dim[dim] = concur_nthread; - } - } - - uint32_t TEXEL_SIZE = 4 * sizeof(float); - const uint32_t concur_nthread_x = concur_nthread_by_dim[0]; - const uint32_t concur_nthread_y = concur_nthread_by_dim[1]; - - uint32_t cacheline_size = TEXEL_SIZE * - std::max(concur_nthread_x, concur_nthread_y) / - std::min(concur_nthread_x, concur_nthread_y); - - std::cout << "TextureCachelineSize," << cacheline_size << std::endl; - - std::string cacheline_dim; - cacheline_dim = concur_nthread_x >= concur_nthread_y ? "X" : "Y"; - std::cout << "TextureCachelineDim," << cacheline_dim << std::endl; - } - - private: - void _bandwidth(std::string memtype, uint32_t range) { - auto memtype_lower = memtype; - std::transform( - memtype_lower.begin(), - memtype_lower.end(), - memtype_lower.begin(), - [](unsigned char c) { return std::tolower(c); }); - - auto test_name = memtype_lower + "_bandwidth"; - - // Cache lines flushed - const uint32_t NFLUSH = _get_config(test_name, "nflush"); - // Number of loop unrolls. Changing this value requires an equal change in - // buf_bandwidth.yaml - const uint32_t NUNROLL = _get_config(test_name, "nunroll"); - // Number of iterations. Increasing this value reduces noise in exchange for - // higher latency. - const uint32_t NITER = _get_config(test_name, "niter"); - // Vector dimensions (vec4) - const uint32_t VEC_WIDTH = 4; - const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); - // Number of vectors that fit in the selected memory space - const uint32_t NVEC = range / VEC_SIZE; - // Number of memory reads per thread - const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; - // Number of threads needed to read al l vectors - // The thread count doesn't divide by thread workload in shared memory - // because of the limited memory size. - const uint32_t NTHREAD = - memtype == "Shared" ? 
NVEC : NVEC / NREAD_PER_THREAD; - // Occupy all threads - const uint32_t local_x = nthread_logic_; - // Ensure that global is a multiple of local, and distribute across all SMs - const uint32_t global_x = - (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH; - - auto bench = [&](uint32_t access_size) { - // Number of vectors that fit in this iteration - const uint32_t nvec_access = access_size / VEC_SIZE; - - StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); - StorageBuffer out_buf( - context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "buf_bandwidth_" + memtype_lower; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {global_x, 1, 1}, - {local_x, 1, 1}, - {SV(NITER), SV(nvec_access), SV(local_x)}, - VK_NULL_HANDLE, - 0, - in_buf.buffer(), - out_buf.buffer()); - }); - - const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; - auto gbps = SIZE_TRANS * 1e-3 / time; - std::cout << memtype << " bandwidth accessing \t" << access_size - << "\tB unique data is \t" << gbps << " \tgbps (\t" << time - << "\tus)" << std::endl; - return gbps; - }; - - double max_bandwidth = 0; - double min_bandwidth = DBL_MAX; - for (uint32_t access_size = VEC_SIZE; access_size < range; - access_size *= 2) { - double gbps = bench(access_size); - max_bandwidth = std::max(gbps, max_bandwidth); - min_bandwidth = std::min(gbps, min_bandwidth); - } - - std::cout << "Max" << memtype << "Bandwidth (GB/s)," << max_bandwidth - << std::endl; - std::cout << "Min" << memtype << "Bandwidth (GB/s)," << min_bandwidth - << std::endl; - } - - public: - void buf_bandwidth() { - if (!_enabled("buffer_bandwidth")) { - std::cout << "Skipped Memory Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ Memory Bandwidth ------" << std::endl; - // Maximum memory space read - 128MB - // For regular devices, bandwidth plateaus at less memory than this, so more - // is not needed. - const uint32_t RANGE = _get_config("buffer_bandwidth", "range"); - _bandwidth("Buffer", RANGE); - } - - void ubo_bandwidth() { - if (!_enabled("ubo_bandwidth")) { - std::cout << "Skipped UBO Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ UBO Bandwidth ------" << std::endl; - const uint32_t RANGE = _get_config("ubo_bandwidth", "range"); - _bandwidth("UBO", RANGE); - } - - void shared_mem_bandwidth() { - if (!_enabled("shared_mem_bandwidth")) { - std::cout << "Skipped Shared Memory Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ Shared Bandwidth ------" << std::endl; - const uint32_t RANGE = max_shared_mem_size_; - _bandwidth("Shared", RANGE); - } - - void tex_bandwidth() { - if (!_enabled("tex_bandwidth")) { - std::cout << "Skipped Texture Bandwidth" << std::endl; - return; - } - - for (int dim = 0; dim < 3; dim++) { - std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------" - << std::endl; - const uint32_t MAX_SIZE = dim == 0 ? max_tex_width_ - : dim == 1 ? max_tex_height_ - : max_tex_depth_; - - // rgba, float - const uint32_t VEC_WIDTH = 4; - const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); - const uint32_t NVEC = MAX_SIZE; - - const uint32_t RANGE = NVEC * VEC_SIZE; - - // Cache lines flushed - const uint32_t NFLUSH = _get_config("tex_bandwidth", "nflush"); - // Number of loop unrolls. 
Changing this value requires an equal change in - // tex_bandwidth.yaml - const uint32_t NUNROLL = _get_config("tex_bandwidth", "nunroll"); - // Number of iterations. Increasing this value reduces noise in exchange - // for higher latency. - const uint32_t NITER = _get_config("tex_bandwidth", "niter"); - // Number of memory reads per thread - const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; - // Number of threads needed to read all texells - const uint32_t NTHREAD = NVEC; - // Occupy all threads - const uint32_t local_x = nthread_logic_; - // Ensure that global is a multiple of local, and distribute across all - // SMs - const uint32_t global_x = - (NTHREAD / local_x * local_x) * sm_count_ * NFLUSH; - - auto shader_name = "tex_bandwidth_" + std::to_string(dim); - - std::vector sizes_whd = {MAX_SIZE, 1, 1}; - if (dim == 1) { - sizes_whd = {1, MAX_SIZE, 1}; - } else if (dim == 2) { - sizes_whd = {1, 1, MAX_SIZE}; - } - auto sizes_nchw = _whd_to_nchw(sizes_whd); - - vTensor in_tensor = - api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - - auto bench = [&](uint32_t access_size, uint32_t dim) { - // Number of texels that fit in this iteration - const uint32_t ntexel_access = access_size / VEC_SIZE; - - StorageBuffer out_buf( - context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {global_x, 1, 1}, - {local_x, 1, 1}, - {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)}, - VK_NULL_HANDLE, - 0, - in_tensor.image(), - out_buf.buffer()); - }); - - const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; - double gbps = SIZE_TRANS * 1e-3 / time; - std::cout << "Texture bandwidth accessing \t" << access_size - << "\tB unique data is \t" << gbps << " \tgbps (\t" << time - << "\tus)" << std::endl; - return gbps; - }; - - double max_bandwidth = 0; - double min_bandwidth = DBL_MAX; - for (uint32_t access_size = VEC_SIZE; access_size < RANGE; - access_size *= 2) { - double gbps = bench(access_size, dim); - max_bandwidth = std::max(gbps, max_bandwidth); - min_bandwidth = std::min(gbps, min_bandwidth); - } - - std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth - << std::endl; - std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth - << std::endl; - } - } - - // Warp size is a difficult metric to obtain because the hardware limitations - // do not always coincide with the way the SM divides the workload. For - // instance, the hardware can have a warp size of 64 threads, but an SM might - // be able to simulate concurrency of 128 threads with a single scheduler. - - // Because of this, it is important to measure the warp size different ways, - // that can evidence both the physical limitations of the hardware, and the - // actual behavior of the driver. - - // Additionally,the SM can behave in two different ways when the assigned - // workload is smaller than the warp size. - - // In Case 1, like ARM Mali, the SM can assign dummy workloads to fill empty - // threads and maintain a uniform workload. - - // In Case 2, like in Adreno, the driver might decide to pack multiple works - // together and dispatch them at once. 
- void warp_size(bool verbose = false) { - if (!_enabled("warp_size")) { - std::cout << "Skipped Warp Size" << std::endl; - return; - } - - std::cout << "\n------ Warp Size ------" << std::endl; - - // Method A: Stress test with a kernel that uses complex ALU operations like - // integer division to avoid latency hiding. Increase the number of threads - // until a jump in latency is detected. - - // This timing-based method helps us identify physical warp sizes. It also - // helps with Case 2, when threads of multiple warps are managed by the same - // scheduler at the same time. - const double COMPENSATE = _get_config("warp_size", "compensate"); - const double THRESHOLD = _get_config("warp_size", "threshold"); - - uint32_t NITER; - - auto bench = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "warp_size_physical"; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - // Large number of work groups selected to potentially saturate all - // ALUs and thus have a better baseline for comparison. - {nthread, 1024, 1}, - {nthread, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - out_buf.buffer()); - }); - - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - uint32_t warp_size = subgroup_size_; - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - - // We increase the number of threads until we hit a jump in the data. - uint32_t nthread = 1; - for (; nthread <= nthread_logic_; ++nthread) { - double time = bench(nthread); - std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)" - << std::endl; - if (dj.push(time)) { - warp_size = nthread - 1; - break; - } - } - if (nthread >= nthread_logic_) { - std::cout - << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size" - << std::endl; - } - - // Method B: Let all the threads in a warp race and atomically fetch-add - // a counter, then store the counter values to the output buffer in the - // scheduling order of these threads. If all the order numbers follow an - // ascending order, then the threads are likely executing within a warp. - // Threads in different warps are not managed by the same scheduler, so they - // would race for a same ID out of order, unaware of each other. - - // This method evidences the actual driver behavior when running - // concurrency, regardless of the physical limitations of the hardware. - - // Likewise, this method helps us identify warp sizes when the SM - // sub-divides its ALUs into independent groups, like the three execution - // engines in a Mali G76 core. It helps warp-probing in Case 1 because it - // doesn't depend on kernel timing, so the extra wait time doesn't lead to - // inaccuracy. 
- auto bench_sm = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, nthread_logic_); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "warp_size_scheduler"; - - benchmark_on_gpu(shader_name, 1, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread, 1, 1}, - {nthread, 1, 1}, - {}, - VK_NULL_HANDLE, - 0, - out_buf.buffer()); - }); - - std::vector data(nthread_logic_); - copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes()); - - if (verbose) { - std::stringstream ss; - for (auto j = 0; j < nthread; ++j) { - ss << data[j] << " "; - } - std::cout << ss.str() << std::endl; - } - - // Check until which point is the data in ascending order. - int32_t last = -1; - int32_t j = 0; - for (; j < nthread; ++j) { - if (last >= data[j]) { - break; - } - last = data[j]; - } - - return j; - }; - - // Test increasing sizes until the data is no longer in ascending order. - uint32_t warp_size_scheduler = warp_size; - int i = 1; - for (; i <= nthread_logic_; ++i) { - uint32_t nascend = bench_sm(i); - if (nascend != i) { - warp_size_scheduler = nascend; - break; - } - } - if (i > nthread_logic_) { - std::cout << "Unable to conclude an SM Warp Size." << std::endl; - } - - std::cout << "PhysicalWarpSize," << warp_size << std::endl; - std::cout << "SMWarpSize," << warp_size_scheduler << std::endl; - } -}; - -int main(int argc, const char** argv) { - App app; - - std::string file_path = "config.json"; - if (argc > 1) { - file_path = argv[1]; - }; - app.load_config(file_path); - - app.reg_count(); - app.buf_cacheline_size(); - app.buf_bandwidth(); - app.ubo_bandwidth(); - app.shared_mem_bandwidth(); - app.warp_size(); - app.tex_bandwidth(); - app.tex_cacheline_size(); - - return 0; -}
diff --git a/backends/vulkan/tools/gpuinfo/src/main.cpp b/backends/vulkan/tools/gpuinfo/src/main.cpp
new file mode 100644
index 00000000000..92b11ce9b4b
--- /dev/null
+++ b/backends/vulkan/tools/gpuinfo/src/main.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "app.h"
+#include "architecture.h"
+#include "buffers.h"
+#include "textures.h"
+
+using namespace vkapi;
+
+int main(int argc, const char** argv) {
+  gpuinfo::App app;
+
+  std::string file_path = "config.json";
+  if (argc > 1) {
+    file_path = argv[1];
+  }
+  app.load_config(file_path);
+
+  // Architecture
+  gpuinfo::reg_count(app);
+  gpuinfo::warp_size(app);
+
+  // Buffers
+  gpuinfo::buf_cacheline_size(app);
+  gpuinfo::buf_bandwidth(app);
+  gpuinfo::ubo_bandwidth(app);
+  gpuinfo::shared_mem_bandwidth(app);
+
+  // Textures
+  gpuinfo::tex_bandwidth(app);
+  gpuinfo::tex_cacheline_size(app);
+
+  return 0;
+}