Add metric for 3D texture cache line size (#4421)
Summary: Pull Request resolved: #4421

Differential Revision: https://internalfb.com/D60246121
estebanpadilla authored and facebook-github-bot committed Jul 26, 2024
1 parent eb02f6b commit f5683a6
Showing 3 changed files with 165 additions and 1 deletion.
39 changes: 39 additions & 0 deletions backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.glsl
@@ -0,0 +1,39 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}
#define VEC4_T ${texel_type(DTYPE)}

layout(std430) buffer;

${layout_declare_sampler(0, "r", "in_tex", DTYPE)}
${layout_declare_buffer(1, "w", "out_buf", DTYPE, "PRECISION", False)}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int niter = 1;

void main() {
vec4 sum = vec4(0);
int i = 0;
for (; i < niter; ++i){
$if DIM == 0:
sum += texelFetch(in_tex, ivec3(gl_GlobalInvocationID[0], 0, 0), 0);
$elif DIM == 1:
sum += texelFetch(in_tex, ivec3(0, gl_GlobalInvocationID[0], 0), 0);
$elif DIM == 2:
sum += texelFetch(in_tex, ivec3(0, 0, gl_GlobalInvocationID[0]), 0);
}

// Depend on the loop counter so the compiler cannot optimize the loop away
vec4 zero = vec4(i>>31);

out_buf[0] = sum + zero;
}
14 changes: 14 additions & 0 deletions backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_size.yaml
@@ -0,0 +1,14 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

tex_cacheline_size:
  parameter_names_with_default_values:
    DTYPE: float
  generate_variant_forall:
    DIM:
      - RANGE: [0, 2]
  shader_variants:
    - NAME: tex_cacheline_size
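
The RANGE entry expands DIM to 0, 1, and 2, so the codegen emits one shader variant per texture dimension (tex_cacheline_size_0 through tex_cacheline_size_2), and the host selects a variant by appending the probed dimension to the base name, as app.cpp does below. A minimal C++ illustration of that naming step (hypothetical helper, not part of the commit):

#include <string>

// Hypothetical helper, not from the commit: maps the probed texture dimension
// (0 = width, 1 = height, 2 = depth) to the generated shader variant name,
// mirroring the lookup in tex_cacheline_size().
std::string tex_cacheline_shader_for_dim(int dim) {
  return "tex_cacheline_size_" + std::to_string(dim);  // _0, _1 or _2
}
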
113 changes: 112 additions & 1 deletion backends/vulkan/tools/gpuinfo/src/app.cpp
@@ -291,12 +291,122 @@ class App {
if (stride >= MAX_STRIDE) {
std::cout << "Unable to conclude a top level buffer cacheline size."
<< std::endl;
-      cacheline_size = MAX_STRIDE;
+      cacheline_size = MAX_STRIDE * sizeof(float);
}

std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl;
}

// Textures are drastically different from buffers in terms of data layout.
// While a buffer is a contiguous range of memory, a texture is an opaque
// object whose layout is chosen by the vendor, so texels that are adjacent in
// the image are not necessarily neighbors in memory. Moreover, data is
// accessed in multi-dimensional patches rather than simple lines, which makes
// the stride method used for the buffer cacheline size inapplicable. To work
// around this, this experiment launches an increasing number of threads, each
// accessing a different texel, and measures the latency. If the cache line is
// large enough for all threads to be served at once, latency stays low. Once
// there are more threads than a single cache line can serve, a second line
// must be fetched, raising latency in a measurable way. With this, we can
// find the cache line size along each of the three dimensions.
void tex_cacheline_size() {
if (!_enabled("tex_cacheline_size")) {
std::cout << "Skipped Texture Cacheline Size" << std::endl;
return;
}

const double COMPENSATE = _get_config("tex_cacheline_size", "compensate");
const double THRESHOLD = _get_config("tex_cacheline_size", "threshold");

uint32_t concur_nthread_by_dim[3];

for (int dim = 0; dim < 3; ++dim) {
std::cout << std::endl;
std::cout << "------ Texture Cacheline Size (dim = " << dim << ") ------"
<< std::endl;

uint32_t NITER;

const uint32_t IMG_OTHER_EDGE = dim == 0 ? max_tex_width_
: dim == 1 ? max_tex_height_
: max_tex_depth_;

const uint32_t MAX_NTHREAD = std::min(nthread_logic_, IMG_OTHER_EDGE);

uint32_t& concur_nthread = concur_nthread_by_dim[dim];

auto bench = [&](uint32_t nthread) {
std::vector<int64_t> sizes_whd = {
max_tex_width_, max_tex_height_, max_tex_depth_};

auto sizes_nchw = _whd_to_nchw(sizes_whd);

vTensor in_tensor =
api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);

// Single vec4
StorageBuffer out_buf(context(), vkapi::kFloat, 4);

vkapi::PipelineBarrier pipeline_barrier{};

auto shader_name = "tex_cacheline_size_" + std::to_string(dim);

auto time = benchmark_on_gpu(shader_name, 100, [&]() {
context()->submit_compute_job(
VK_KERNEL_FROM_STR(shader_name),
pipeline_barrier,
{nthread, 1, 1},
{nthread, 1, 1},
{SV(NITER)},
VK_NULL_HANDLE,
0,
in_tensor.image(),
out_buf.buffer());
});
return time;
};

ensure_min_niter(1000, NITER, [&]() { return bench(1); });

DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
uint32_t nthread = 1;
for (; nthread <= MAX_NTHREAD; ++nthread) {
double time = bench(nthread);
std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time
<< std::endl;

if (dj.push(time)) {
concur_nthread = nthread - 1;
std::cout << "Can concurrently access " << concur_nthread
<< "px with " << "minimal cost along dim=" << dim
<< std::endl;
break;
}
}
if (nthread >= MAX_NTHREAD) {
std::cout
<< "Unable to conclude a top level texture cacheline size for dim "
<< dim << std::endl;
} else {
concur_nthread_by_dim[dim] = concur_nthread;
}
}

uint32_t TEXEL_SIZE = 4 * sizeof(float);
const uint32_t concur_nthread_x = concur_nthread_by_dim[0];
const uint32_t concur_nthread_y = concur_nthread_by_dim[1];

uint32_t cacheline_size = TEXEL_SIZE *
std::max(concur_nthread_x, concur_nthread_y) /
std::min(concur_nthread_x, concur_nthread_y);

std::cout << "TextureCachelineSize," << cacheline_size << std::endl;

std::string cacheline_dim;
cacheline_dim = concur_nthread_x >= concur_nthread_y ? "X" : "Y";
std::cout << "TextureCachelineDim," << cacheline_dim << std::endl;
}

private:
void _bandwidth(std::string memtype, uint32_t range) {
auto memtype_lower = memtype;
@@ -689,6 +799,7 @@ int main(int argc, const char** argv) {
app.shared_mem_bandwidth();
app.warp_size();
app.tex_bandwidth();
app.tex_cacheline_size();

return 0;
}
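
To make the final arithmetic in tex_cacheline_size() concrete: the loop yields, per dimension, how many texels can be fetched concurrently before latency jumps, and the reported cacheline size is the texel size (a vec4 of floats, 16 bytes) scaled by the ratio of the X and Y counts, with the larger-count dimension reported as the cacheline direction. A standalone sketch of that final step, using hypothetical counts rather than measured ones:

#include <algorithm>
#include <cstdint>
#include <iostream>

// Illustrative sketch of the size/direction derivation; not part of the commit.
int main() {
  // Hypothetical results: texels accessible with minimal cost along X and Y.
  const uint32_t concur_nthread_x = 16;
  const uint32_t concur_nthread_y = 4;

  // One texel is a vec4 of 32-bit floats, i.e. 16 bytes.
  const uint32_t TEXEL_SIZE = 4 * sizeof(float);

  // The ratio of the wider to the narrower dimension gives the line length in
  // texels; the wider dimension is reported as the cacheline direction.
  const uint32_t cacheline_size = TEXEL_SIZE *
      std::max(concur_nthread_x, concur_nthread_y) /
      std::min(concur_nthread_x, concur_nthread_y);
  const char* cacheline_dim = concur_nthread_x >= concur_nthread_y ? "X" : "Y";

  // Prints "TextureCachelineSize,64" and "TextureCachelineDim,X" for these inputs.
  std::cout << "TextureCachelineSize," << cacheline_size << std::endl;
  std::cout << "TextureCachelineDim," << cacheline_dim << std::endl;
  return 0;
}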
