Skip to content

Commit

Permalink
Add 3D Texture Bandwidth metric (#4336)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #4336

This diff introduces a profiler that obtains the maximum and minimum bandwidth for reading unique addresses from 3D textures in each of its dimensions, using the following shader, where A is a 3D texture and B is a write-only buffer.

The calculation of the texel position will depend on the dimension that is being benchmarked

x : pos = ivec3(offset, 0, 0)
y : pos = ivec3(0, offset, 0)
z : pos = ivec3(0, 0, offset)

  void main() {
    vec4 sum = vec4(0);
    const uint workgroup_width = local_group_size * niter * ${NUNROLL};
    uint offset = (gl_WorkGroupID[0] * workgroup_width  + gl_LocalInvocationID[0]) & addr_mask;

    int i = 0;
    for (; i < niter; ++i)
    {
        sum *= texelFetch(A, pos, 0);
        offset = (offset + local_group_size) & addr_mask;
        ...
        ...
        sum *= texelFetch(A, pos, 0);
        offset = (offset + local_group_size) & addr_mask;
    }

    vec4 zero = vec4(i>>31);

    B[gl_LocalInvocationID[0]] = sum + zero;
  }

The address mask allows us to control how many unique addresses we are accessing. If the number of unique vectors we want to read is 3, the offset will jump between three unique addresses throughout the iterations, giving us the bandwidth for that specific size of data. If the size of the unique data read is larger than the work group size, then each run will have its own block of data to read, defined by the initial offset calculation, where the offset is obtained through the workgroup ID and the local invocation ID.

Finally, we make sure to use the `sum` and `i` variables so that the compiler's optimizer does not flatten the loops.

For a Samsung S22, the bandwidth behaves like this for each of the dimensions.
{F1767497386}

Comparing the bandwidth for the X dimension to OpenCL, which was obtained through [ArchProbe](https://github.com/microsoft/ArchProbe), we can observe that, although the behavior is the same, Vulkan has an increased bandwidth for most access sizes.

{F1767497972}

Comparing to the bandwidth for buffers, we can observe that the bandwidth is similar to regular buffers, but still much smaller than UBOs at small access sizes.

 {F1767497707}

Reviewed By: jorgep31415

Differential Revision: D59980139
  • Loading branch information
Esteban Padilla Cerdio authored and facebook-github-bot committed Jul 26, 2024
1 parent 9129892 commit a5f48c2
Show file tree
Hide file tree
Showing 3 changed files with 186 additions and 0 deletions.
59 changes: 59 additions & 0 deletions backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

// Bandwidth probe: each invocation repeatedly reads unique texels from the 3D
// texture A along one dimension (selected at codegen time via DIM) and writes
// a result into buffer B so the reads cannot be optimized away.

#define PRECISION ${PRECISION}
#define VEC4_T ${texel_type(DTYPE)}

layout(std430) buffer;

${layout_declare_sampler(0, "r", "A", DTYPE)}
${layout_declare_buffer(1, "w", "B", DTYPE, "PRECISION", False)}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// Specialization constants supplied by the host (see app.cpp: SV(NITER),
// SV(ntexel_access), SV(local_x)); their IDs must stay in sync with the host.
layout(constant_id = 3) const int niter = 1;
layout(constant_id = 4) const int nvec = 1;
layout(constant_id = 5) const int local_group_size = 1;

void main() {
// The address mask works as a modulo because x % 2^n == x & (2^n - 1).
// This will help us limit address accessing to a specific set of unique
// addresses depending on the access size we want to measure.
// NOTE(review): this identity only holds when nvec is a power of two; the
// host sweeps access sizes in powers of two, which satisfies this.
const int addr_mask = nvec - 1;
vec4 sum = vec4(0);

// This is to distribute the accesses to unique addresses across the workgroups, once the
// size of the access exceeds the workgroup width.
const uint workgroup_width = local_group_size * niter * ${NUNROLL};
uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask;

int i = 0;
for (; i < niter; ++i){
VEC4_T in_texel;
// Codegen: the loop body below is emitted NUNROLL times, fetching along the
// dimension selected by DIM (0 = x, 1 = y, 2 = z).
$for j in range(int(NUNROLL)):
$if DIM == 0:
in_texel = texelFetch(A, ivec3(offset, 0, 0), 0);
$elif DIM == 1:
in_texel = texelFetch(A, ivec3(0, offset, 0), 0);
$elif DIM == 2:
in_texel = texelFetch(A, ivec3(0, 0, offset), 0);

// Consume the fetched texel so the compiler cannot eliminate the read.
sum *= in_texel;

// On each unroll, a new unique address will be accessed through the offset,
// limited by the address mask to a specific set of unique addresses
offset = (offset + local_group_size) & addr_mask;
}

// This is to ensure no compiler optimizations occur.
// i ends the loop non-negative, so (i >> 31) is always 0; using it keeps the
// loop counter observable without changing the written value.
vec4 zero = vec4(i>>31);

B[gl_LocalInvocationID[0]] = sum + zero;
}
15 changes: 15 additions & 0 deletions backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Codegen config for the 3D texture bandwidth profiler shader.
tex_bandwidth:
parameter_names_with_default_values:
# Texel data type of the source texture A.
DTYPE: float
# Number of manual loop unrolls emitted per iteration. Changing this value
# requires an equal change to NUNROLL in gpuinfo/src/app.cpp.
NUNROLL: "16"
generate_variant_forall:
# Emit one shader variant per texture dimension:
# 0 = x (width), 1 = y (height), 2 = z (depth).
DIM:
- RANGE: [0, 2]
shader_variants:
# Variant names are suffixed with DIM, e.g. tex_bandwidth_0.
- NAME: tex_bandwidth
112 changes: 112 additions & 0 deletions backends/vulkan/tools/gpuinfo/src/app.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ class App {
uint32_t sm_count_;
uint32_t nthread_logic_;
uint32_t subgroup_size_;
uint32_t max_tex_width_;
uint32_t max_tex_height_;
uint32_t max_tex_depth_;

public:
App() {
Expand All @@ -36,6 +39,9 @@ class App {
nthread_logic_ = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
buf_cache_size_ = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();
max_shared_mem_size_ = cl_device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
max_tex_width_ = cl_device.getInfo<CL_DEVICE_IMAGE3D_MAX_WIDTH>();
max_tex_height_ = cl_device.getInfo<CL_DEVICE_IMAGE3D_MAX_HEIGHT>();
max_tex_depth_ = cl_device.getInfo<CL_DEVICE_IMAGE3D_MAX_DEPTH>();

VkPhysicalDeviceSubgroupProperties subgroup_props{};
VkPhysicalDeviceProperties2 props2{};
Expand All @@ -54,6 +60,9 @@ class App {
std::cout << "Cache Size," << buf_cache_size_ << std::endl;
std::cout << "Shared Memory Size," << max_shared_mem_size_ << std::endl;
std::cout << "SubGroup Size," << subgroup_size_ << std::endl;
std::cout << "MaxTexWidth," << max_tex_width_ << std::endl;
std::cout << "MaxTexHeight," << max_tex_height_ << std::endl;
std::cout << "MaxTexDepth," << max_tex_depth_ << std::endl;
}

void reg_count() {
Expand Down Expand Up @@ -308,6 +317,15 @@ class App {
<< std::endl;
}

// Convert a desired 3D texture extent {W, H, D} into the NCHW tensor sizes
// that produce a channels-packed texture of exactly that extent.
//
// A channels-packed tensor of sizes {N, C, H, W} maps to a texture of extent
// {W, H, (C / 4) * N}; choosing N = 1 and C = D * 4 therefore yields {W, H, D}.
//
// @param sizes texture extent as {W, H, D}; must contain at least 3 elements
// @return tensor sizes as {N, C, H, W}
std::vector<int64_t> _whd_to_nchw(const std::vector<int64_t>& sizes) {
  const int64_t W = sizes[0];
  const int64_t H = sizes[1];
  const int64_t D = sizes[2];

  // Channels-packed: {W, H, D} = {W, H, (C / 4) * N}
  return {1, D * 4, H, W};
}

public:
void buf_bandwidth() {
std::cout << "\n------ Memory Bandwidth ------" << std::endl;
Expand All @@ -323,12 +341,105 @@ class App {
const uint32_t RANGE = 128 * 1024 * 1024;
_bandwidth("UBO", RANGE);
}

// Profile memory bandwidth when reading from shared (local) memory, sweeping
// access sizes up to the device's reported shared memory capacity.
void shared_mem_bandwidth() {
  std::cout << "\n------ Shared Bandwidth ------" << std::endl;
  const uint32_t range = max_shared_mem_size_;
  _bandwidth("Shared", range);
}

// Measure read bandwidth for 3D textures along each dimension (x, y, z) by
// dispatching the generated tex_bandwidth_<dim> shader over a sweep of unique
// access sizes, and report the max/min observed bandwidth per dimension.
void tex_bandwidth() {
for (int dim = 0; dim < 3; dim++) {
std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------"
<< std::endl;
// Device limit for the dimension being probed.
const uint32_t MAX_SIZE = dim == 0 ? max_tex_width_
: dim == 1 ? max_tex_height_
: max_tex_depth_;

// rgba, float
const uint32_t VEC_WIDTH = 4;
const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
const uint32_t NVEC = MAX_SIZE;

// Total bytes spanned by the sweep of unique addresses.
const uint32_t RANGE = NVEC * VEC_SIZE;

// Cache lines flushed
const uint32_t NFLUSH = 4;
// Number of loop unrolls. Changing this value requires an equal change in
// tex_bandwidth.yaml
const uint32_t NUNROLL = 16;
// Number of iterations. Increasing this value reduces noise in exchange
// for higher latency.
const uint32_t NITER = 10;
// Number of memory reads per thread
const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
// Number of threads needed to read all texels
const uint32_t NTHREAD = NVEC;
// Occupy all threads
const uint32_t local_x = nthread_logic_;
// Ensure that global is a multiple of local, and distribute across all
// SMs
// NOTE(review): integer division truncates — if NTHREAD < local_x this
// yields global_x == 0 and the dispatch does no work; confirm MAX_SIZE is
// always >= nthread_logic_ on targeted devices.
const uint32_t global_x =
(NTHREAD / local_x * local_x) * sm_count_ * NFLUSH;

// Variant name suffix must match the DIM variants in tex_bandwidth.yaml.
auto shader_name = "tex_bandwidth_" + std::to_string(dim);

// Lay the texture out along the dimension under test only.
std::vector<int64_t> sizes_whd = {MAX_SIZE, 1, 1};
if (dim == 1) {
sizes_whd = {1, MAX_SIZE, 1};
} else if (dim == 2) {
sizes_whd = {1, 1, MAX_SIZE};
}
auto sizes_nchw = _whd_to_nchw(sizes_whd);

vTensor in_tensor =
api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);

// Run one benchmark at a given unique-data size (in bytes) and return GB/s.
auto bench = [&](uint32_t access_size, uint32_t dim) {
// Number of texels that fit in this iteration
const uint32_t ntexel_access = access_size / VEC_SIZE;

// One output texel per logical thread, so every invocation has a write.
StorageBuffer out_buf(
context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_);
vkapi::PipelineBarrier pipeline_barrier{};

// Specialization constants must match the constant_id order declared in
// tex_bandwidth.glsl: niter, nvec, local_group_size (plus dim for codegen).
auto time = benchmark_on_gpu(shader_name, 10, [&]() {
context()->submit_compute_job(
VK_KERNEL_FROM_STR(shader_name),
pipeline_barrier,
{global_x, 1, 1},
{local_x, 1, 1},
{SV(NITER), SV(ntexel_access), SV(local_x), SV(dim)},
VK_NULL_HANDLE,
0,
in_tensor.image(),
out_buf.buffer());
});

// Bytes transferred per dispatch; time is in us, so bytes/us * 1e-3 = GB/s.
const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
double gbps = SIZE_TRANS * 1e-3 / time;
std::cout << "Texture bandwidth accessing \t" << access_size
<< "\tB unique data is \t" << gbps << " \tgbps (\t" << time
<< "\tus)" << std::endl;
return gbps;
};

// Sweep power-of-two access sizes (power-of-two is required by the shader's
// address mask) and track the extremes.
double max_bandwidth = 0;
double min_bandwidth = DBL_MAX;
for (uint32_t access_size = VEC_SIZE; access_size < RANGE;
access_size *= 2) {
double gbps = bench(access_size, dim);
max_bandwidth = std::max(gbps, max_bandwidth);
min_bandwidth = std::min(gbps, min_bandwidth);
}

std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth
<< std::endl;
std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth
<< std::endl;
}
}

// Warp size is a difficult metric to obtain because the hardware limitations
// do not always coincide with the way the SM divides the workload. For
// instance, the hardware can have a warp size of 64 threads, but an SM might
Expand Down Expand Up @@ -492,6 +603,7 @@ int main(int argc, const char** argv) {
app.ubo_bandwidth();
app.shared_mem_bandwidth();
app.warp_size();
app.tex_bandwidth();

return 0;
}

0 comments on commit a5f48c2

Please sign in to comment.