Skip to content

Commit

Permalink
Move calculations away from GPU in Bandwidth profilers (#4445)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #4445

This is simply to have a more accurate result when doing bandwidth profiling by removing calculations that can be done outside the shader, leaving only the read operations behind.

Reviewed By: copyrightly

Differential Revision: D60396870
  • Loading branch information
Esteban Padilla Cerdio authored and facebook-github-bot committed Jul 29, 2024
1 parent 0ca42f5 commit 7fcd2e1
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 18 deletions.
13 changes: 5 additions & 8 deletions backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
layout(constant_id = 3) const int niter = 1;
layout(constant_id = 4) const int nvec = 1;
layout(constant_id = 5) const int local_group_size = 1;
// The address mask works as a modulo because x % 2^n == x & (2^n - 1).
// This will help us limit address accessing to a specific set of unique
// addresses depending on the access size we want to measure.
layout(constant_id = 6) const int addr_mask = 1;
layout(constant_id = 7) const int workgroup_width = 1;

$if MEMTYPE == "shared":
shared vec4 A[nvec];
Expand All @@ -36,15 +41,7 @@ void main() {
A[gl_LocalInvocationID[0]][0] = gl_LocalInvocationID[0];
memoryBarrierShared();

// The address mask works as a modulo because x % 2^n == x & (2^n - 1).
// This will help us limit address accessing to a specific set of unique
// addresses depending on the access size we want to measure.
const int addr_mask = nvec - 1;
vec4 sum = vec4(0);

// This is to distribute the accesses to unique addresses across the workgroups, once the
// size of the access exceeds the workgroup width.
const uint workgroup_width = local_group_size * niter * ${NUNROLL};
uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask;

int i = 0;
Expand Down
13 changes: 5 additions & 8 deletions backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,14 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
layout(constant_id = 3) const int niter = 1;
layout(constant_id = 4) const int nvec = 1;
layout(constant_id = 5) const int local_group_size = 1;
// The address mask works as a modulo because x % 2^n == x & (2^n - 1).
// This will help us limit address accessing to a specific set of unique
// addresses depending on the access size we want to measure.
layout(constant_id = 6) const int addr_mask = 1;
layout(constant_id = 7) const int workgroup_width = 1;

void main() {
// The address mask works as a modulo because x % 2^n == x & (2^n - 1).
// This will help us limit address accessing to a specific set of unique
// addresses depending on the access size we want to measure.
const int addr_mask = nvec - 1;
vec4 sum = vec4(0);

// This is to distribute the accesses to unique addresses across the workgroups, once the
// size of the access exceeds the workgroup width.
const uint workgroup_width = local_group_size * niter * ${NUNROLL};
uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask;

int i = 0;
Expand Down
15 changes: 14 additions & 1 deletion backends/vulkan/tools/gpuinfo/include/buffers.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,15 @@ void _bandwidth(
// Number of vectors that fit in this iteration
const uint32_t nvec_access = access_size / VEC_SIZE;

// The address mask works as a modulo because x % 2^n == x & (2^n - 1).
// This will help us limit address accessing to a specific set of unique
// addresses depending on the access size we want to measure.
const uint32_t addr_mask = nvec_access - 1;

// This is to distribute the accesses to unique addresses across the
// workgroups, once the size of the access exceeds the workgroup width.
const uint32_t workgroup_width = local_x * NITER * NUNROLL;

StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float));
StorageBuffer out_buf(
context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic);
Expand All @@ -136,7 +145,11 @@ void _bandwidth(
pipeline_barrier,
{global_x, 1, 1},
{local_x, 1, 1},
{SV(NITER), SV(nvec_access), SV(local_x)},
{SV(NITER),
SV(nvec_access),
SV(local_x),
SV(addr_mask),
SV(workgroup_width)},
VK_NULL_HANDLE,
0,
in_buf.buffer(),
Expand Down
15 changes: 14 additions & 1 deletion backends/vulkan/tools/gpuinfo/include/textures.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,15 @@ void tex_bandwidth(const App& app) {
// Number of texels that fit in this iteration
const uint32_t ntexel_access = access_size / VEC_SIZE;

// The address mask works as a modulo because x % 2^n == x & (2^n - 1).
// This will help us limit address accessing to a specific set of unique
// addresses depending on the access size we want to measure.
const uint32_t addr_mask = ntexel_access - 1;

// This is to distribute the accesses to unique addresses across the
// workgroups, once the size of the access exceeds the workgroup width.
const uint32_t workgroup_width = local_x * NITER * NUNROLL;

StorageBuffer out_buf(
context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic);
vkapi::PipelineBarrier pipeline_barrier{};
Expand All @@ -174,7 +183,11 @@ void tex_bandwidth(const App& app) {
pipeline_barrier,
{global_x, 1, 1},
{local_x, 1, 1},
{SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)},
{SV(NITER),
SV(ntexel_access),
SV(local_x),
SV(addr_mask),
SV(workgroup_width)},
VK_NULL_HANDLE,
0,
in_tensor.image(),
Expand Down

0 comments on commit 7fcd2e1

Please sign in to comment.