From 32d485634d59214d4202a634b3e927a29cee11aa Mon Sep 17 00:00:00 2001 From: Esteban Padilla Cerdio Date: Tue, 30 Jul 2024 10:44:32 -0700 Subject: [PATCH] Move calculations away from GPU in Bandwidth profilers (#4445) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/4445 This is simply to have a more accurate result when doing bandwidth profiling by removing calculations that can be done outside the shader, leaving only the read operations behind. Reviewed By: copyrightly Differential Revision: D60396870 --- .../vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl | 13 +++++-------- .../vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl | 13 +++++-------- backends/vulkan/tools/gpuinfo/include/buffers.h | 15 ++++++++++++++- backends/vulkan/tools/gpuinfo/include/textures.h | 15 ++++++++++++++- 4 files changed, 38 insertions(+), 18 deletions(-) diff --git a/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl b/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl index c16ad5d14ba..38c9befec6f 100644 --- a/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl +++ b/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl @@ -26,6 +26,11 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int niter = 1; layout(constant_id = 4) const int nvec = 1; layout(constant_id = 5) const int local_group_size = 1; +// The address mask works as a modulo because x % 2^n == x & (2^n - 1). +// This will help us limit address accessing to a specific set of unique +// addresses depending on the access size we want to measure. +layout(constant_id = 6) const int addr_mask = 1; +layout(constant_id = 7) const int workgroup_width = 1; $if MEMTYPE == "shared": shared vec4 A[nvec]; @@ -36,15 +41,7 @@ void main() { A[gl_LocalInvocationID[0]][0] = gl_LocalInvocationID[0]; memoryBarrierShared(); - // The address mask works as a modulo because x % 2^n == x & (2^n - 1). 
- // This will help us limit address accessing to a specific set of unique - // addresses depending on the access size we want to measure. - const int addr_mask = nvec - 1; vec4 sum = vec4(0); - - // This is to distribute the accesses to unique addresses across the workgroups, once the - // size of the access excedes the workgroup width. - const uint workgroup_width = local_group_size * niter * ${NUNROLL}; uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask; int i = 0; diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl index d848fc04754..7ab67bd2d0a 100644 --- a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl +++ b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl @@ -21,17 +21,14 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int niter = 1; layout(constant_id = 4) const int nvec = 1; layout(constant_id = 5) const int local_group_size = 1; +// The address mask works as a modulo because x % 2^n == x & (2^n - 1). +// This will help us limit address accessing to a specific set of unique +// addresses depending on the access size we want to measure. +layout(constant_id = 6) const int addr_mask = 1; +layout(constant_id = 7) const int workgroup_width = 1; void main() { - // The address mask works as a modulo because x % 2^n == x & (2^n - 1). - // This will help us limit address accessing to a specific set of unique - // addresses depending on the access size we want to measure. - const int addr_mask = nvec - 1; vec4 sum = vec4(0); - - // This is to distribute the accesses to unique addresses across the workgroups, once the - // size of the access excedes the workgroup width. 
- const uint workgroup_width = local_group_size * niter * ${NUNROLL}; uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask; int i = 0; diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h index 8cb0da49ca8..c8cf93c4a12 100644 --- a/backends/vulkan/tools/gpuinfo/include/buffers.h +++ b/backends/vulkan/tools/gpuinfo/include/buffers.h @@ -123,6 +123,15 @@ void _bandwidth( // Number of vectors that fit in this iteration const uint32_t nvec_access = access_size / VEC_SIZE; + // The address mask works as a modulo because x % 2^n == x & (2^n - 1). + // This will help us limit address accessing to a specific set of unique + // addresses depending on the access size we want to measure. + const uint32_t addr_mask = nvec_access - 1; + + // This is to distribute the accesses to unique addresses across the + // workgroups, once the size of the access exceeds the workgroup width. + const uint32_t workgroup_width = local_x * NITER * NUNROLL; + StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); StorageBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); @@ -136,7 +145,11 @@ void _bandwidth( pipeline_barrier, {global_x, 1, 1}, {local_x, 1, 1}, - {SV(NITER), SV(nvec_access), SV(local_x)}, + {SV(NITER), + SV(nvec_access), + SV(local_x), + SV(addr_mask), + SV(workgroup_width)}, VK_NULL_HANDLE, 0, in_buf.buffer(), diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h index bb8a3371a96..7679f11b0ca 100644 --- a/backends/vulkan/tools/gpuinfo/include/textures.h +++ b/backends/vulkan/tools/gpuinfo/include/textures.h @@ -164,6 +164,15 @@ void tex_bandwidth(const App& app) { // Number of texels that fit in this iteration const uint32_t ntexel_access = access_size / VEC_SIZE; + // The address mask works as a modulo because x % 2^n == x & (2^n - 1). 
+ // This will help us limit address accessing to a specific set of unique + // addresses depending on the access size we want to measure. + const uint32_t addr_mask = ntexel_access - 1; + + // This is to distribute the accesses to unique addresses across the + // workgroups, once the size of the access exceeds the workgroup width. + const uint32_t workgroup_width = local_x * NITER * NUNROLL; + StorageBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; @@ -174,7 +183,11 @@ void tex_bandwidth(const App& app) { pipeline_barrier, {global_x, 1, 1}, {local_x, 1, 1}, - {SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)}, + {SV(NITER), + SV(ntexel_access), + SV(local_x), + SV(addr_mask), + SV(workgroup_width)}, VK_NULL_HANDLE, 0, in_tensor.image(),