diff --git a/AUTHORS b/AUTHORS index 5e721f8b7..d07fca732 100644 --- a/AUTHORS +++ b/AUTHORS @@ -9,3 +9,4 @@ Cole Ramos Fei Zheng Jose Santos Karl Schultz +Nicholas Curtis \ No newline at end of file diff --git a/README.md b/README.md index 8e10e1c1c..1465cf7a0 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,11 @@ contribution process. * Licensing information can be found in the [LICENSE](LICENSE) file. +## Examples +A set of guided exercises demonstrating kernel optimization using Omniperf can be found in the [amd/HPCTrainingExamples](https://github.com/amd/HPCTrainingExamples/tree/main/OmniperfExamples) repo. + + + ## Development Omniperf follows a diff --git a/sample/common.h b/sample/common.h new file mode 100644 index 000000000..b6edfeab0 --- /dev/null +++ b/sample/common.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + +#define hipCheck(stmt) \ + do { \ + hipError_t err = stmt; \ + if (err != hipSuccess) { \ + char msg[256]; \ + sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, \ + __FUNCTION__, __LINE__); \ + std::string errstring = hipGetErrorString(err); \ + std::cerr << msg << "\t" << errstring << std::endl; \ + throw std::runtime_error(msg); \ + } \ + } while (0) diff --git a/sample/fabric.hip b/sample/fabric.hip new file mode 100644 index 000000000..2c1f6b5ff --- /dev/null +++ b/sample/fabric.hip @@ -0,0 +1,315 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +A data-fabric exerciser example, written by Nicholas Curtis [AMD] + +The test allows the user to control the: + - The granularity of an allocation (Coarse vs Fine-grained), + - The owner of an allocation (local HBM, CPU DRAM or remote HBM), + - The size of an allocation (the default is ~4GiB), and + - The type of operation we are executing (read, write, atomics of various flavors) + +This lets the user explore the impact of these choices on the generated +data-fabric traffic. 
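For illustration, a hypothetical build-and-run sequence (the numeric flag values
map onto the enums defined below: --type 1 = CoarseGrained, --owner 0 = local
device HBM, --op 0 = Read, --data 0 = Unsigned):

    hipcc fabric.hip -o fabric
    ./fabric --type 1 --owner 0 --op 0 --data 0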
+*/ + + +#include +#include + +#include +#include + +#include "common.h" + +enum class mtype : int { FineGrained = 0, CoarseGrained = 1, Undef = 3 }; +enum class mowner : int { Device = 0, Host = 1, Remote = 2, Undef = 3 }; +enum class mspace : int { Global = 0, Undef = 1 }; +enum class mop : int { + Read = 0, + Write = 1, + AtomicAdd = 2, + AtomicCas = 3, + AtomicOr = 4, + AtomicMax = 5, + Undef = 6 +}; +enum class mdata : int { Unsigned = 0, UnsignedLong = 1, Float = 2, Double = 3, Undef = 4 }; + +template +T parse(const char* value) { + int ivalue = std::atoi(value); + if (ivalue < 0 || ivalue >= int(T::Undef)) { + throw std::runtime_error("bad enum value!"); + } + return T(ivalue); +} + +void parse(int argc, char** argv, mtype& mytype, mowner& myowner, + mspace& myspace, size_t& size, mop& myop, mdata& mydata, + int& remoteId) { + while (1) { + static struct option long_options[] = { + /* These options set a flag. */ + {"type", required_argument, 0, 't'}, + {"owner", required_argument, 0, 'o'}, + {"size", required_argument, 0, 'z'}, + {"op", required_argument, 0, 'p'}, + {"remote", required_argument, 0, 'r'}, + {"data", required_argument, 0, 'd'}, + {0, 0, 0, 0}}; + /* getopt_long stores the option index here. */ + int option_index = 0; + + int c = + getopt_long(argc, argv, "t:o:z:p:r:d:", long_options, &option_index); + + /* Detect the end of the options. */ + if (c == -1) break; + + switch (c) { + case 't': + mytype = parse(optarg); + break; + + case 'o': + myowner = parse(optarg); + break; + + case 'z': + size = std::atoll(optarg); + break; + + case 'p': + myop = parse(optarg); + break; + + case 'r': + remoteId = std::atoi(optarg); + break; + + case 'd': + mydata = parse(optarg); + break; + + case '?': + /* getopt_long already printed an error message. */ + break; + + default: + abort(); + } + } + std::cout << "Using: " << std::endl; + std::cout << "\tmtype:" + << ((mytype == mtype::FineGrained) ? "FineGrained" + : "CoarseGrained") + << std::endl; + std::cout << "\tmowner:" + << ((myowner == mowner::Device) + ? "Device" + : ((myowner == mowner::Host) ? "Host" : "Remote")) + << std::endl; + std::cout << "\tmspace:Global" << std::endl; + std::cout << "\tmop:" << ((myop == mop::Read) ? "Read" : (myop == mop::Write ? "Write" : (myop == mop::AtomicAdd ? "Add" : (myop == mop::AtomicCas ? "CAS" : (myop == mop::AtomicOr ? "Or" : "Max"))))) << std::endl; + std::cout << "\tmdata:" << (mydata == mdata::Unsigned ? "Unsigned" : (mydata == mdata::UnsignedLong ? "Unsigned Long" : (mydata == mdata::Float ? 
"Float" : "Double"))) << std::endl; + std::cout << "\tremoteId:" << remoteId << std::endl; +} + +// dummy intialization kernel +__global__ void init() {} + +template +void alloc(mtype memory, mowner owner, T** ptr, size_t Nbytes, int devId, + int remoteId) { + bool is_device = (owner == mowner::Device) || (owner == mowner::Remote); + if (owner == mowner::Remote) { + // enable remote access + hipCheck(hipDeviceEnablePeerAccess(remoteId, 0)); + // set id for alloc + hipCheck(hipSetDevice(remoteId)); + } + init<<<1, 1>>>(); + + if (memory == mtype::FineGrained && is_device) { + hipCheck( + hipExtMallocWithFlags((void**)ptr, Nbytes, hipDeviceMallocFinegrained)); + } else if (memory == mtype::CoarseGrained && is_device) { + hipCheck(hipMalloc(ptr, Nbytes)); + } else if (memory == mtype::FineGrained && owner == mowner::Host) { + hipCheck(hipHostMalloc(ptr, Nbytes, hipHostMallocCoherent)); + } else if (memory == mtype::CoarseGrained && owner == mowner::Host) { + hipCheck(hipHostMalloc(ptr, Nbytes, hipHostMallocNonCoherent)); + } else { + assert(false && "unknown combo"); + } + + // set to random + std::vector host(Nbytes / sizeof(T), T(0)); + hipCheck(hipMemcpy(*ptr, &host[0], Nbytes, + (is_device ? hipMemcpyHostToDevice : hipMemcpyHostToHost))); + + if (owner == mowner::Remote) { + // reset id for execution + hipCheck(hipSetDevice(devId)); + } +} + +template +void release(mtype memory, mowner owner, T* ptr) { + bool is_device = (owner == mowner::Device) || (owner == mowner::Remote); + if (memory == mtype::FineGrained && is_device) { + hipCheck(hipFree(ptr)); + } else if (memory == mtype::CoarseGrained && is_device) { + hipCheck(hipFree(ptr)); + } else if (memory == mtype::FineGrained && owner == mowner::Host) { + hipCheck(hipHostFree(ptr)); + } else if (memory == mtype::CoarseGrained && owner == mowner::Host) { + hipCheck(hipHostFree(ptr)); + } else { + assert(false && "unknown combo"); + } +} + +// the main streaming kernel +template +__global__ void kernel(T* x, size_t N, T zero, T foo) { + int sum = 0; + const size_t offset_start = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = 0; i < repeats; ++i) { + for (size_t offset = offset_start; offset < N; + offset += blockDim.x * gridDim.x) { + T uniq = (foo + offset) + i; + if constexpr (op == mop::Read) { + sum += x[offset]; + } else if constexpr (op == mop::Write) { + x[offset] = (T)offset; + } else if constexpr (op == mop::AtomicAdd) { + atomicAdd(&x[offset], uniq); + } else if constexpr (op == mop::AtomicCas) { + atomicCAS(&x[offset], uniq, uniq); + } else if constexpr (op == mop::AtomicOr) { + atomicOr(&x[offset], uniq); + } else if constexpr (op == mop::AtomicMax) { + atomicMax(&x[offset], uniq); + } + } + } + if constexpr (op == mop::Read) { + if (sum != 0) { + x[offset_start] = sum; + } + } +} + +template +void run_kernel(T* x, size_t size) { + if constexpr (op == mop::AtomicOr && std::is_floating_point_v) { + throw std::runtime_error("bad"); + } else { + kernel<<<4096, 1024>>>(x, size, 0, T(23456789)); + // then run once for data collection + kernel<<<4096, 1024>>>(x, size, 0, T(23456789)); + } +} + +template +void run_atomic(mowner myowner, T* x, size_t size) { + if (myowner == mowner::Host) { + // speed it up + run_kernel(x, size / 10); + } else { + run_kernel(x, size); + } +} + +template +void run(mtype mytype, mspace myspace, mowner myowner, mop myop, int remoteId, + size_t size) { + int devId = 0; + if (myowner == mowner::Remote && remoteId == -1) { + // need to find a remote GPU + int ndevices; + 
hipCheck(hipGetDeviceCount(&ndevices)); + if (ndevices <= 1) { + throw std::runtime_error( + "Need >=2 devices available for mowner = Remote"); + } + for (int i = 0; i < ndevices; ++i) { + if (i != devId) { + remoteId = i; + break; + } + } + } + + T* x; + alloc(mytype, myowner, &x, size * sizeof(T), devId, remoteId); + + // run the kernel once for warmup + assert(4096 * 1024 < size); + if (myop == mop::Read) { + run_kernel(x, size); + } else if (myop == mop::Write) { + run_kernel(x, size); + } else if (myop == mop::AtomicAdd) { + run_atomic(myowner, x, size); + } else if (myop == mop::AtomicCas) { + run_atomic(myowner, x, size); + } else if (myop == mop::AtomicOr) { + run_atomic(myowner, x, size); + } else if (myop == mop::AtomicMax) { + run_atomic(myowner, x, size); + } else { + throw std::runtime_error("bad"); + } + hipCheck(hipDeviceSynchronize()); + release(mytype, myowner, x); +} + +int main(int argc, char** argv) { + mtype mytype = (mtype)0; + mspace myspace = (mspace)0; + mowner myowner = (mowner)0; + mop myop = (mop)0; + mdata mydata = (mdata)0; + int remoteId = -1; + size_t size = 1024ull * 1024ull * + 1024ull; // 4 GiB, purposefully much larger than caches. + parse(argc, argv, mytype, myowner, myspace, size, myop, mydata, remoteId); + if (mydata == mdata::Unsigned) + run(mytype, myspace, myowner, myop, remoteId, size); + else if (mydata == mdata::UnsignedLong) + run(mytype, myspace, myowner, myop, remoteId, size); + else if (mydata == mdata::Float) + run(mytype, myspace, myowner, myop, remoteId, size); + else if (mydata == mdata::Double) + run(mytype, myspace, myowner, myop, remoteId, size); + else { + throw std::runtime_error("bad"); + } +} \ No newline at end of file diff --git a/sample/instmix.hip b/sample/instmix.hip new file mode 100644 index 000000000..a409db4b0 --- /dev/null +++ b/sample/instmix.hip @@ -0,0 +1,113 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +A instruction mix exerciser example, written by Gina Sitaraman and Nicholas Curtis [AMD]. +Although inline assembly is inherently unportable, this is expected to work on all CDNA accelerators. 
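As a sketch of how this kernel might be driven under Omniperf (the workload name
here is arbitrary, and the analyze invocation mirrors the vcopy examples used in
the project documentation):

    hipcc instmix.hip -o instmix
    omniperf profile -n instmix -- ./instmix
    omniperf analyze -p workloads/instmix/mi200/ --list-metrics gfx90a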
+*/ + + +#include "common.h" + +__global__ void kernelasm() { + // int32 + int i, j; + asm volatile("v_add_u32_e32 %0, %1, %0\n" : "=v"(j) : "v"(i)); + + // int 64 + long int l1, l2; + asm volatile("v_cmp_eq_i64 %0, %1\n" : "=v"(l2) : "v"(l1), "v"(i)); + + // fp32: add, mul, transcendental and fma + float f1, f2; + asm volatile( + "v_add_f32_e32 %0, %1, %0\n" + "v_mul_f32_e32 %0, %1, %0\n" + "v_sqrt_f32 %0, %1\n" + "v_fma_f32 %0, %1, %0, %1\n" + : "=v"(f1) + : "v"(f2)); + + // fp64: add, mul, transcendental and fma + double d1, d2, d3, d4; + asm volatile( + "v_add_f64 %0, %1, %0\n" + "v_mul_f64 %0, %1, %0\n" + "v_fma_f64 %0, %1, %0, %1\n" + "v_sqrt_f64 %0, %1\n" + "v_min_f64 %0, %1, %0\n" + : "+v"(d1) + : "v"(d2)); + + // fp16: add, mul, transcendental and fma + _Float16 h1, h2; + asm volatile( + "v_add_f16_e32 %0, %1, %0\n" + "v_mul_f16_e32 %0, %1, %0\n" + "v_sqrt_f16 %0, %1\n" + "v_cvt_f16_f32 %0 %2\n" + "v_fma_f16 %0, %1, %0, %0\n" + : "=v"(h2) + : "v"(h1), "v"(f1)); + + // MFMA ops + double2 dd; + unsigned short us; + long2 ll; +#if defined(__gfx90a__) + asm volatile("v_mfma_f64_4x4x4f64 %0 %1 %2 %3\n" + : "=v"(d4) + : "v"(d1), "v"(d2), "v"(d3)); + asm volatile("v_mfma_f32_16x16x4f32 %0 %1 %2 1\n" + : "=v"(dd) + : "v"(f1), "v"(f2)); + asm volatile("v_mfma_f32_16x16x16f16 %0 %1 %2 1\n" + : "=v"(dd) + : "v"(d1), "v"(d2)); + asm volatile("v_mfma_f32_16x16x8bf16 %0 %1 %2 1\n" + : "=v"(dd) + : "v"(f1), "v"(f2)); + asm volatile("v_mfma_i32_16x16x16i8 %0 %1 %2 1\n" + : "=v"(ll) + : "v"(i), "v"(j)); +#endif + + // Scalar op + asm volatile("s_add_i32 %0 %1 %0\n" : "=s"(j) : "s"(i)); + + // LDS + asm volatile("ds_read_b32 %0 %0\n" : "=v"(i) : "v"(j)); + + // Branch + asm volatile( + "s_branch .LDUMMY\n" + ".LDUMMY:\n" + "s_endpgm\n"); +} +int main() { + kernelasm<<<1, 64>>>(); + hipCheck(hipDeviceSynchronize()); +} diff --git a/sample/ipc.hip b/sample/ipc.hip new file mode 100644 index 000000000..9fcdf462d --- /dev/null +++ b/sample/ipc.hip @@ -0,0 +1,127 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +An example to explore IPC and divergence, written by Nicholas Curtis [AMD]. +This example may not work on all CDNA accelerators, but has been verified on MI2XX. 
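As a rough worked example of the divergence case below, assuming a 64-wide
wavefront (as on CDNA accelerators): a 1024-thread block is 16 wavefronts, and
the predicate threadIdx.x % 64 == 0 leaves exactly one of the 64 lanes in each
wavefront active. The wavefront still issues and executes the full v_mov
instruction stream with the other lanes masked off, which is what makes the
divergent variant interesting to compare against the uniform vmov kernel.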
+*/ + +#include "common.h" + +template +__device__ void vrcp_op() { + int dummy; + if constexpr (N >= 1) { + asm volatile("v_rcp_f64 v[0:1], v[0:1]\n" : : "{v31}"(dummy)); + vrcp_op(); + } +} + +template +__global__ void vrcp() { + vrcp_op(); +} + +template +__device__ void vmov_op() { + int dummy; + if constexpr (N >= 1) { + asm volatile("v_mov_b32 v0, v1\n" : : "{v31}"(dummy)); + vmov_op(); + } +} + +template +__global__ void vmov() { + vmov_op(); +} + +template +__device__ void mfma_op() { + int dummy; + if constexpr (N >= 1) { + asm volatile("v_mfma_f32_32x32x8bf16_1k v[0:15], v[16:17], v[18:19], v[0:15]\n" : : "{v31}"(dummy)); + mfma_op(); + } +} + +template +__global__ void mfma() { + mfma_op(); +} + +template +__device__ void snop_op() { + int dummy; + if constexpr (N >= 1) { + asm volatile("s_nop 0x0\n" : : "{v31}"(dummy)); + snop_op(); + } +} + + +template +__global__ void snop() { + snop_op(); +} + +template +__device__ void smov_op() { + int dummy; + if constexpr (N >= 1) { + asm volatile("s_mov_b32 s0, s1\n" : : "{s31}"(dummy)); + smov_op(); + } +} + +template +__global__ void smov() { + smov_op(); +} + +template +__global__ void vmov_with_divergence() { + if (threadIdx.x % 64 == 0) + vmov_op(); +} + +int main() { + // warmups, spam to all CUs + vrcp<<<1024 * 1024, 1024>>>(); + vmov<<<1024 * 1024, 1024>>>(); + mfma<<<1024 * 1024, 1024>>>(); + snop<<<1024 * 1024, 1024>>>(); + smov<<<1024 * 1024, 1024>>>(); + vmov_with_divergence<<<1024 * 1024, 1024>>>(); + hipCheck(hipDeviceSynchronize()); + vrcp<<<1024 * 1024, 1024>>>(); + vmov<<<1024 * 1024, 1024>>>(); + mfma<<<1024 * 1024, 1024>>>(); + snop<<<1024 * 1024, 1024>>>(); + smov<<<1024 * 1024, 1024>>>(); + vmov_with_divergence<<<1024 * 1024, 1024>>>(); + hipCheck(hipDeviceSynchronize()); +} \ No newline at end of file diff --git a/sample/lds.hip b/sample/lds.hip new file mode 100644 index 000000000..2018ad8da --- /dev/null +++ b/sample/lds.hip @@ -0,0 +1,78 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +An example to explore LDS bandwidth and bank conflicts, written by Nicholas Curtis [AMD]. 
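A short worked example of the two access patterns below, assuming an LDS built
from 32 four-byte banks (as on current CDNA accelerators): in the load kernel
each thread reads array[threadIdx.x], so consecutive lanes touch consecutive
words and hence distinct banks. In the conflicts kernel each thread reads
array[threadIdx.x * 32]; because the bank is (word index) % 32, every lane in a
wavefront maps to the same bank and its accesses should be serialized, which is
what the bank-conflict counters are expected to expose.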
+*/ + + +#include "common.h" + +constexpr unsigned max_threads = 256; +constexpr unsigned nbanks = 32; + +__global__ void load(int* out, int flag) { + __shared__ int array[max_threads]; + int index = threadIdx.x; + // fake a store to the LDS array to avoid unwanted behavior + if (flag) + array[max_threads - index] = index; + __syncthreads(); + int x = array[index]; + if (x == int(-1234567)) + out[threadIdx.x] = x; +} + +__global__ void conflicts(int* out, int flag) { + constexpr unsigned nelements = nbanks * max_threads; + __shared__ int array[nelements]; + // each thread reads from the same bank + int index = threadIdx.x * nbanks; + // fake a store to the LDS array to avoid unwanted behavior + if (flag) + array[max_threads - index] = index; + __syncthreads(); + int x = array[index]; + if (x == int(-1234567)) + out[threadIdx.x] = x; +} + +void bandwidth_demo(int N) { + for (int i = 1; i <= N; ++i) + load<<<1,i>>>(nullptr, 0); + hipCheck(hipDeviceSynchronize()); +} + +void conflicts_demo(int N) { + for (int i = 1; i <= N; ++i) + conflicts<<<1,i>>>(nullptr, 0); + hipCheck(hipDeviceSynchronize()); +} + +int main() { + bandwidth_demo(max_threads); + conflicts_demo(max_threads); +} diff --git a/sample/occupancy.hip b/sample/occupancy.hip new file mode 100644 index 000000000..7c7099e30 --- /dev/null +++ b/sample/occupancy.hip @@ -0,0 +1,109 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +An example to explore achieved occupancy, and various occupancy limiters. +Written by Nicholas Curtis [AMD]. 
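As a rough worked example of one limiter below, assuming 64 KiB of LDS per
compute unit (as on MI200-class accelerators): the ldsbound kernel statically
allocates 64 KiB / sizeof(double) = 8192 doubles of LDS, i.e. the entire 64 KiB,
per workgroup, so at most one 256-thread workgroup (4 wavefronts) can be
resident on a CU at a time, far below the hardware's per-CU wavefront limit,
which is what caps the achieved occupancy.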
+*/ + + +#include "common.h" + +__global__ void empty(int N, double* ptr) { + +} + +constexpr int bound = 16; +__launch_bounds__(256) +__global__ void vgprbound(int N, double* ptr) { + double intermediates[bound]; + for (int i = 0 ; i < bound; ++i) intermediates[i] = N * threadIdx.x; + double x = ptr[threadIdx.x]; + for (int i = 0; i < 100; ++i) { + x += sin(pow(__shfl(x, i % warpSize) * intermediates[(i - 1) % bound], intermediates[i % bound])); + intermediates[i % bound] = x; + } + if (x == N) ptr[threadIdx.x] = x; +} + +constexpr size_t fully_allocate_lds = 64ul * 1024ul / sizeof(double); +__launch_bounds__(256) +__global__ void ldsbound(int N, double* ptr) { + __shared__ double intermediates[fully_allocate_lds]; + for (int i = threadIdx.x ; i < fully_allocate_lds; i += blockDim.x) intermediates[i] = N * threadIdx.x; + __syncthreads(); + double x = ptr[threadIdx.x]; + for (int i = threadIdx.x; i < fully_allocate_lds; i += blockDim.x) { + x += sin(pow(__shfl(x, i % warpSize) * intermediates[(i - 1) % fully_allocate_lds], intermediates[i % fully_allocate_lds])); + __syncthreads(); + intermediates[i % fully_allocate_lds] = x; + } + if (x == N) ptr[threadIdx.x] = x; +} + +constexpr int sgprlim = 1; +__launch_bounds__(1024, 8) +__global__ void sgprbound(int N, double* ptr) { + double intermediates[sgprlim]; + for (int i = 0 ; i < sgprlim; ++i) intermediates[i] = i; + double x = ptr[0]; + #pragma unroll 1 + for (int i = 0; i < 100; ++i) { + x += sin(pow(intermediates[(i - 1) % sgprlim], intermediates[i % sgprlim])); + intermediates[i % sgprlim] = x; + } + if (x == N) ptr[0] = x; +} + +int main() { + double* ptr; + hipCheck(hipMalloc(&ptr, 1024 * sizeof(double))); + vgprbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + vgprbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + ldsbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + ldsbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + sgprbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + sgprbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + empty<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + empty<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + hipCheck(hipFree(ptr)); +} diff --git a/sample/stack.hip b/sample/stack.hip new file mode 100644 index 000000000..9f030309a --- /dev/null +++ b/sample/stack.hip @@ -0,0 +1,43 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +An example to explore spill/stack instructions. +Written by Nicholas Curtis [AMD]. +*/ + +#include "common.h" + +__global__ void knl(int* out, int filter) { + int x[1024]; + x[filter] = 0; + if (threadIdx.x < filter) out[threadIdx.x] = x[threadIdx.x]; +} + +int main() { + knl<<<1, 1>>>(nullptr, 0); + hipCheck(hipDeviceSynchronize()); +} \ No newline at end of file diff --git a/sample/vcopy.cpp b/sample/vcopy.cpp index 0eed48711..88fdff22e 100644 --- a/sample/vcopy.cpp +++ b/sample/vcopy.cpp @@ -1,3 +1,29 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el +*/ + #include "hip/hip_runtime.h" #include #include diff --git a/sample/vmem.hip b/sample/vmem.hip new file mode 100644 index 000000000..e85d1baa5 --- /dev/null +++ b/sample/vmem.hip @@ -0,0 +1,98 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +An example to explore global/generic instructions. +Written by Nicholas Curtis [AMD]. +*/ + +#include "common.h" + +typedef int __attribute__((address_space(0)))* generic_ptr; + +__attribute__((noinline)) __device__ void generic_store(generic_ptr ptr, int zero) { *ptr = zero; } +__attribute__((noinline)) __device__ int generic_load(generic_ptr ptr) { return *ptr; } +__attribute__((noinline)) __device__ void generic_atomic(generic_ptr ptr, int zero) { atomicAdd((int*)ptr, zero); } + +__global__ void global_write(int* ptr, int zero) { + ptr[threadIdx.x] = zero; +} + +__global__ void generic_write(int* ptr, int zero, int filter) { + __shared__ int lds[1024]; + int* generic = (threadIdx.x < filter) ? &ptr[threadIdx.x] : &lds[threadIdx.x]; + generic_store((generic_ptr)generic, zero); +} + +__global__ void global_read(int* ptr, int zero) { + int x = ptr[threadIdx.x]; + if (x != zero) { + ptr[threadIdx.x] = x + 1; + } +} + +__global__ void generic_read(int* ptr, int zero, int filter) { + __shared__ int lds[1024]; + if (static_cast(filter - 1) == zero) { + lds[threadIdx.x] = 0; // initialize to zero to avoid conditional, but hide behind _another_ conditional + } + int* generic; + if (static_cast(threadIdx.x) > filter - 1) { + generic = &ptr[threadIdx.x]; + } else { + generic = &lds[threadIdx.x]; + abort(); + } + int x = generic_load((generic_ptr)generic); + if (x != zero) { + ptr[threadIdx.x] = x + 1; + } +} + + +__global__ void global_atomic(int* ptr, int zero) { + atomicAdd(ptr, zero); +} + +__global__ void generic_atomic(int* ptr, int filter, int zero) { + __shared__ int lds[1024]; + int* generic = (threadIdx.x % 2 == filter) ? &ptr[threadIdx.x] : &lds[threadIdx.x]; + generic_atomic((generic_ptr)generic, zero); +} + +int main() { + int* ptr; + hipCheck(hipMalloc(&ptr, sizeof(int))); + hipCheck(hipMemset(ptr, 0, sizeof(int))); + global_write<<<1,1>>>(ptr, 0); + generic_write<<<1,1>>>(ptr, 0, 0); + global_read<<<1,1>>>(ptr, 0); + generic_read<<<1,1>>>(ptr, 0, 0); + global_atomic<<<1,1>>>(ptr, 0); + generic_atomic<<<1,1>>>(ptr, 0, 0); + hipCheck(hipDeviceSynchronize()); + hipCheck(hipFree(ptr)); +} diff --git a/src/docs/analysis.md b/src/docs/analysis.md index 9b68249c4..1997563c3 100644 --- a/src/docs/analysis.md +++ b/src/docs/analysis.md @@ -5,7 +5,7 @@ :glob: :maxdepth: 4 ``` -Omniperf offers several ways to interact with the metrics it generates from profiling. The option you choose will likey be influnced by your familiarity with the profiled application, computing enviroment, and experience with Omniperf. +Omniperf offers several ways to interact with the metrics it generates from profiling. The option you choose will likely be influnced by your familiarity with the profiled application, computing enviroment, and experience with Omniperf. While analyzing with the CLI offers quick and straightforward access to Omniperf metrics from terminal, the GUI adds an extra layer of styling and interactiveness some users may prefer. 
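For example, the same profiled workload can be inspected either way (the paths
below follow the vcopy example used throughout this page; the second command adds
the `--gui` flag to serve the lightweight standalone GUI instead of printing to
the terminal):

```shell-session
$ omniperf analyze -p workloads/vcopy/mi200/
$ omniperf analyze -p workloads/vcopy/mi200/ --gui
```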
@@ -16,7 +16,7 @@ See sections below for more information on each. ### Features -- All Omniperf built-in metrics. +- All of Omniperf's built-in metrics. - Multiple runs base line comparison. - Metrics customization: pick up subset of build-in metrics or build your own profiling configuration. - Kernel, gpu-id, dispatch-id filters. @@ -107,7 +107,7 @@ Analyze 2. System Speed-of-Light .... ``` - 2. Use `--list-metrics` to generate a list of availible metrics for inspection + 2. Use `--list-metrics` to generate a list of available metrics for inspection ```shell-session $ omniperf analyze -p workloads/vcopy/mi200/ --list-metrics gfx90a ╒═════════╤═════════════════════════════╕ @@ -254,7 +254,7 @@ Analyze ``` > **Note:** Some cells may be blank indicating a missing/unavailable hardware counter or NULL value -3. Optimizatize application, iterate, and re-profile to inspect performance changes. +3. Optimize application, iterate, and re-profile to inspect performance changes. 4. Redo a comprehensive analysis with Omniperf CLI at any milestone or at the end. ### Demo @@ -281,7 +281,7 @@ Analyze $ omniperf analyze -p workloads/vcopy/mi200/ -b 2 5.1.0 ``` - > Note: Users can filter single metric or the whole IP block by its id. In this case, 1 is the id for "system speed of light" and 5.1.0 the id for metric "GPU Busy Cycles". + > Note: Users can filter single metric or the whole hardware component by its id. In this case, 1 is the id for "system speed of light" and 5.1.0 the id for metric "GPU Busy Cycles". - Filter kernels @@ -304,7 +304,7 @@ Analyze ``` - Second, select the index of the kernel you'd like to filter (i.e. __vecCopy(double*, double*, double*, int, int) [clone .kd]__ at index __0__). Then, use this index to apply the filter via `-k/--kernels`. + Second, select the index of the kernel you would like to filter (i.e. __vecCopy(double*, double*, double*, int, int) [clone .kd]__ at index __0__). Then, use this index to apply the filter via `-k/--kernels`. ```shell-session $ omniperf -p workloads/vcopy/mi200/ -k 0 @@ -325,7 +325,7 @@ Analyze ... ... ``` - > Note: You'll see your filtered kernel(s) indicated by a asterisk in the Top Stats table + > Note: You will see your filtered kernel(s) indicated by an asterisk in the Top Stats table - Baseline comparison @@ -333,7 +333,7 @@ Analyze ```shell omniperf analyze -p workload1/path/ -p workload2/path/ ``` - > Note: You can also apply diffrent filters to each workload. + > Note: You can also apply different filters to each workload. OR ```shell @@ -414,7 +414,7 @@ When no filters are applied, users will see five basic sections derived from the To dive deeper, use the top drop down menus to isolate particular kernel(s) or dispatch(s). You will then see the web page update with -metrics specific to the filter you've applied. +metrics specific to the filter you have applied. Once you have applied a filter, you will also see several additional sections become available with detailed metrics specific to that area @@ -427,22 +427,22 @@ interface](https://amdresearch.github.io/omniperf/analysis.html#grafana-based-gu #### Features The Omniperf Grafana GUI Analyzer supports the following features to facilitate MI GPU performance profiling and analysis: -- System and IP-Block Speed-of-Light (SOL) +- System and Hardware Component (IP Block) Speed-of-Light (SOL) - Multiple normalization options, including per-cycle, per-wave, per-kernel and per-second. 
- Baseline comparisons - Regex based Dispatch ID filtering - Roofline Analysis -- Detailed per IP Block performance counters and metrics - - CPC/CPF - - SPI - - SQ - - SQC - - TA/TD - - TCP - - TCC (both aggregated and per-channel perf info) +- Detailed performance counters and metrics per hardware component, e.g., + - Command Processor - Fetch (CPF) / Command Processor - Controller (CPC) + - Workgroup Manager (SPI) + - Shader Sequencer (SQ) + - Shader Sequencer Controller (SQC) + - L1 Address Processing Unit, a.k.a. Texture Addresser (TA) / L1 Backend Data Processing Unit, a.k.a. Texture Data (TD) + - L1 Cache (TCP) + - L2 Cache (TCC) (both aggregated and per-channel perf info) ##### Speed-of-Light -Speed-of-light panels are provided at both the system and per IP block level to help diagnosis performance bottlenecks. The performance numbers of the workload under testing are compared to the theoretical maximum, (e.g. floating point operations, bandwidth, cache hit rate, etc.), to indicate the available room to further utilize the hardware capability. +Speed-of-light panels are provided at both the system and per hardware component level to help diagnosis performance bottlenecks. The performance numbers of the workload under testing are compared to the theoretical maximum, (e.g. floating point operations, bandwidth, cache hit rate, etc.), to indicate the available room to further utilize the hardware capability. ##### Multi Normalization @@ -457,24 +457,24 @@ Omniperf enables baseline comparison to allow checking A/B effect. The current r For both the Current Workload and the Baseline Workload, one can independently setup the following filters to allow fine grained comparions: - Workload Name -- GPU ID filtering (multi selection) -- Kernel Name filtering (multi selection) +- GPU ID filtering (multi-selection) +- Kernel Name filtering (multi-selection) - Dispatch ID filtering (Regex filtering) -- Omniperf Panels (multi selection) +- Omniperf Panels (multi-selection) ##### Regex based Dispatch ID filtering -This release enables regex based dispatch ID filtering to flexibly choose the kernel invocations. One may refer to [Regex Numeric Range Generator](https://3widgets.com/), to generate typical number ranges. +This release enables Regular Expression (regex), a standard Linux string matching syntax, based dispatch ID filtering to flexibly choose the kernel invocations. One may refer to [Regex Numeric Range Generator](https://3widgets.com/), to generate typical number ranges. -For example, if one wants to inspect Dispatch Range from 17 to 48, inclusive, the corresponding regex is : **(1[7-9]|[23]\d|4[0-8])**. The generated express can be copied over for filtering. +For example, if one wants to inspect Dispatch Range from 17 to 48, inclusive, the corresponding regex is : **(1[7-9]|[23]\d|4[0-8])**. The generated expression can be copied over for filtering. ##### Incremental Profiling Omniperf supports incremental profiling to significantly speed up performance analysis. -> Refer to [*IP Block profiling*](https://amdresearch.github.io/omniperf/profiling.html#ip-block-profiling) section for this command. +> Refer to [*Hardware Component Filtering*](https://amdresearch.github.io/omniperf/profiling.html#hardware-component-filtering) section for this command. -By default, the entire application is profiled to collect perfmon counter for all IP blocks, giving a system level view of where the workload stands in terms of performance optimization opportunities and bottlenecks. 
+By default, the entire application is profiled to collect performance counters for all hardware blocks, giving a complete view of where the workload stands in terms of performance optimization opportunities and bottlenecks. -After that one may focus on only a few IP blocks, (e.g., L1 Cache or LDS) to closely check the effect of software optimizations, without performing application replay for all other IP Blocks. This saves lots of compute time. In addition, the prior profiling results for other IP blocks are not overwritten. Instead, they can be merged during the import to piece together the system view. +After that one may focus on only a few hardware components, (e.g., L1 Cache or LDS) to closely check the effect of software optimizations, without performing application replay for all other hardware components. This saves lots of compute time. In addition, the prior profiling results for other hardware components are not overwritten. Instead, they can be merged during the import to piece together the system view. ##### Color Coding The uniform color coding is applied to most visualizations (bars, table, diagrams etc). Typically, Yellow color means over 50%, while Red color mean over 90% percent, for easy inspection. @@ -594,7 +594,7 @@ There are currently 18 main panel categories available for analyzing the compute - Command Processor - Command Processor - Fetch (CPF) - Command Processor - Controller (CPC) -- Shader Processing Input (SPI) +- Workgroup Manager or Shader Processor Input (SPI) - SPI Stats - SPI Resource Allocations - Wavefront Launch @@ -655,116 +655,357 @@ There are currently 18 main panel categories available for analyzing the compute - Per-channel L2-EA Read stall (I/O, GMI, HBM) - Per-channel L2-EA Write stall (I/O, GMI, HBM, Starve) -Most panels are designed around a specific IP block to thoroughly understand its behavior. Additional panels, including custom panels, could also be added to aid the performance analysis. +Most panels are designed around a specific hardware component block to thoroughly understand its behavior. Additional panels, including custom panels, could also be added to aid the performance analysis. ##### System Info Panel -![System Info Panel](images/System_info_panel.png) +``` {figure} images/system-info_panel.png +:alt: System Info +:figclass: figure +:align: center + +System details logged from host machine. +``` + ##### Kernel Statistics ###### Kernel Time Histogram -![Kernel Time Histogram](images/Kernel_time_histogram.png) +``` {figure} images/Kernel_time_histogram.png +:alt: Kernel Time Histogram +:figclass: figure +:align: center + +Mapping application kernel launches to execution duration. +``` ###### Top Bottleneck Kernels -![Top Bottleneck Kernels](images/Top_bottleneck_kernels.png) +``` {figure} images/top-stat_panel.png +:alt: Top Bottleneck Kernels +:figclass: figure +:align: center + +Top N kernels and relevant statistics. Sorted by total duration. +``` ###### Top Bottleneck Dispatches -![Top Bottleneck Dispatches](images/Top_bottleneck_dispatches.png) +``` {figure} images/Top_bottleneck_dispatches.png +:alt: Top Bottleneck Dispatches +:figclass: figure +:align: center + +Top N kernel dispatches and relevant statistics. Sorted by total duration. 
+``` ###### Current and Baseline Dispatch IDs (Filtered) -![Current and Baseline Dispatch IDs](images/Current_and_baseline_dispatch_ids.png) +``` {figure} images/Current_and_baseline_dispatch_ids.png +:alt: Current and Baseline Dispatch IDs +:figclass: figure +:align: center + +List of all kernel dispatches. +``` ##### System Speed-of-Light -![System Speed-of-Light](images/System_speed_of_light.png) +``` {figure} images/sol_panel.png +:alt: System Speed-of-Light +:figclass: figure +:align: center + +Key metrics from various sections of Omniperf’s profiling report. +``` ##### Memory Chart Analysis > Note: The Memory Chart Analysis support multiple normalizations. Due to the space limit, all transactions, when normalized to per-sec, default to unit of Billion transactions per second. -![Memory Chart Analysis](images/Memory_chart_analysis.png) +``` {figure} images/memory-chart_panel.png +:alt: Memory Chart Analysis +:figclass: figure +:align: center + +A graphical representation of performance data for memory blocks on the GPU. +``` + +##### Empirical Roofline Analysis +``` {figure} images/roofline_panel.png +:alt: Roofline Analysis +:figclass: figure +:align: center + +Visualize achieved performance relative to a benchmarked peak performance. +``` -##### Roofline Analysis -![Roofline Analysis](images/Roofline_analysis.png) ##### Command Processor -![Command Processor](images/Command_processor.png) -##### Shader Processing Input (SPI) -![Shader Processing Input](images/Shader_processing_input.png) -##### Wavefront Launch -![Wavefront Launch](images/Wavefront_launch.png) +###### Command Processor Fetcher +``` {figure} images/cpc_panel.png +:alt: Command Processor Fetcher +:figclass: figure +:align: center + +Fetches commands out of memory to hand them over to the Command Processor Fetcher (CPC) for processing +``` +###### Command Processor Compute +``` {figure} images/cpf_panel.png +:alt: Command Processor Compute +:figclass: figure +:align: center + +The micro-controller running the command processing firmware that decodes the fetched commands, and (for kernels) passes them to the Workgroup Managers (SPIs) for scheduling. +``` + +##### Shader Processor Input (SPI) +###### SPI Stats +``` {figure} images/spi-stats_panel.png +:alt: SPI Stats +:figclass: figure +:align: center + +TODO: Add caption after merge +``` +###### SPI Resource Allocation +``` {figure} images/spi-resource-allocation_panel.png +:alt: SPI Resource Allocation +:figclass: figure +:align: center + +TODO: Add caption after merge +``` + +##### Wavefront +###### Wavefront Launch Stats +``` {figure} images/wavefront-launch-stats_panel.png +:alt: Wavefront Launch Stats +:figclass: figure +:align: center + +General information about the kernel launch. +``` +###### Wavefront Runtime Stats +``` {figure} images/wavefront-runtime-stats_panel.png +:alt: Wavefront Runtime Stats +:figclass: figure +:align: center + +High-level overview of the execution of wavefronts in a kernel. +``` ##### Compute Unit - Instruction Mix ###### Instruction Mix -![Instruction Mix](images/Instruction_mix.png) +``` {figure} images/cu-inst-mix_panel.png +:alt: Instruction Mix +:figclass: figure +:align: center + +Breakdown of the various types of instructions executed by the user’s kernel, and which pipelines on the Compute Unit (CU) they were executed on. 
+``` ###### VALU Arithmetic Instruction Mix -![VALU Arithmetic Instruction Mix](images/VALU_arithmetic_instruction_mix.png) +``` {figure} images/cu-value-arith-instr-mix_panel.png +:alt: VALU Arithmetic Instruction Mix +:figclass: figure +:align: center + +The various types of vector instructions that were issued to the vector arithmetic logic unit (VALU). +``` ###### MFMA Arithmetic Instruction Mix -![MFMA Arithmetic Instruction Mix](images/MFMA_arithmetic_instruction_mix.png) +``` {figure} images/cu-mafma-arith-instr-mix_panel.png +:alt: MFMA Arithmetic Instruction Mix +:figclass: figure +:align: center + +The types of Matrix Fused Multiply-Add (MFMA) instructions that were issued. +``` ###### VMEM Arithmetic Instruction Mix -![VMEM Arithmetic Instruction Mix](images/VMEM_arithmetic_intensity_mix.png) +``` {figure} images/cu-vmem-instr-mix_panel.png +:alt: VMEM Arithmetic Instruction Mix +:figclass: figure +:align: center + +The types of vector memory (VMEM) instructions that were issued. +``` ##### Compute Unit - Compute Pipeline ###### Speed-of-Light -![Speed-of-Light](images/Comp_pipe_sol.png) -###### Compute Pipeline Stats -![Compute Pipeline Stats](images/Compute_pipeline_stats.png) +``` {figure} images/cu-sol_panel.png +:alt: Speed-of-Light +:figclass: figure +:align: center + +The number of floating-point and integer operations executed on the vector arithmetic logic unit (VALU) and Matrix Fused Multiply-Add (MFMA) units in various precisions. +``` +###### Pipeline Stats +``` {figure} images/cu-pipeline-stats_panel.png +:alt: Pipeline Stats +:figclass: figure +:align: center + +More detailed metrics to analyze the several independent pipelines found in the Compute Unit (CU). +``` ###### Arithmetic Operations -![Arithmetic Operations](images/Arithmetic_operations.png) -###### Memory Latencies -![Memory Latencies](images/Memory_latencies.png) +``` {figure} images/cu-arith-ops_panel.png +:alt: Arithmetic Operations +:figclass: figure +:align: center + +The total number of floating-point and integer operations executed in various precisions. +``` ##### Local Data Share (LDS) ###### Speed-of-Light -![Speed-of-Light](images/LDS_sol.png) +``` {figure} images/lds-sol_panel.png +:alt: Speed-of-Light +:figclass: figure +:align: center + +Key metrics for the Local Data Share (LDS) as a comparison with the peak achievable values of those metrics. +``` ###### LDS Stats -![LDS Stats](images/LDS_stats.png) +``` {figure} images/lds-stats_panel.png +:alt: LDS Stats +:figclass: figure +:align: center + +More detailed view of the Local Data Share (LDS) performance. +``` ##### Instruction Cache ###### Speed-of-Light -![Speed-of-Light](images/Instruc_cache_sol.png) +``` {figure} images/instr-cache-sol_panel.png +:alt: Speed-of-Light +:figclass: figure +:align: center + +Key metrics of the L1 Instruction (L1I) cache as a comparison with the peak achievable values of those metrics. +``` ###### Instruction Cache Stats -![Instruction Cache Stats](images/Instruction_cache_stats.png) +``` {figure} images/instr-cache-accesses_panel.png +:alt: Instruction Cache Stats +:figclass: figure +:align: center + +More detail on the hit/miss statistics of the L1 Instruction (L1I) cache. 
+``` ##### Scalar L1D Cache ###### Speed-of-Light -![](images/L1D_sol.png) -###### Constant Cache Stats -![Constant Cache Stats](images/Vec_L1D_cache_accesses.png) -###### Constant Cache - L2 Interface -![Constant Cache - L2 Interface](images/Constant_cache_l2_interface.png) +``` {figure} images/sl1d-sol_panel.png +:alt: Speed-of-Light +:figclass: figure +:align: center + +Key metrics of the Scalar L1 Data (sL1D) cache as a comparison with the peak achievable values of those metrics. +``` +###### Scalar L1D Cache Accesses +``` {figure} images/sl1d-cache-accesses_panel.png +:alt: Scalar L1D Cache Accesses +:figclass: figure +:align: center + +More detail on the types of accesses made to the Scalar L1 Data (sL1D) cache, and the hit/miss statistics. +``` +###### Scalar L1D Cache - L2 Interface +``` {figure} images/sl1d-l12-interface_panel.png +:alt: Scalar L1D Cache - L2 Interface +:figclass: figure +:align: center + +More detail on the data requested across the Scalar L1 Data (sL1D) cache <-> L2 interface. +``` ##### Texture Address and Texture Data -###### Texture Address (TA) -![Texture Address](images/Texture_address.png) -###### Texture Data (TD) -![Texture Data](images/Texture_data.png) +###### Texture Addresser +``` {figure} images/ta_panel.png +:alt: Texture Addresser +:figclass: figure +:align: center + +Metric specific to texture addresser (TA) which receives commands (e.g., instructions) and write/atomic data from the Compute Unit (CU), and coalesces them into fewer requests for the cache to process. +``` +###### Texture Data +``` {figure} images/td_panel.png +:alt: Texture Data +:figclass: figure +:align: center + +Metrics specific to texture data (TD) which routes data back to the requesting Compute Unit (CU). +``` -##### Vector L1D Cache +##### Vector L1 Data Cache ###### Speed-of-Light -![Speed-of-Light](images/Vec_L1D_cache_sol.png) -###### Vector L1D Cache Accesses -![Vector L1D Cache Accesses](images/Vec_L1D_cache_accesses.png) -###### L1 Cache Stalls -![L1 Cache Stalls](images/L1_cache_stalls.png) -###### L1 - L2 Transactions -![L1 - L2 Transactions](images/L1_l2_transactions.png) -###### L1 - UTCL1 Interface Stats -![L1 - UTCL1 Interface Stats](images/L1_utcl1_transactions.png) +``` {figure} images/vl1d-sol_panel.png +:alt: Speed-of-Light +:figclass: figure +:align: center + +Key metrics of the vector L1 data (vL1D) cache as a comparison with the peak achievable values of those metrics. +``` +###### L1D Cache Stalls +``` {figure} images/vl1d-cache-stalls_panel.png +:alt: L1D Cache Stalls +:figclass: figure +:align: center + +More detail on where vector L1 data (vL1D) cache is stalled in the pipeline, which may indicate performance limiters of the cache. +``` +###### L1D Cache Accesses +``` {figure} images/vl1d-cache-accesses_panel.png +:alt: L1D Cache Accesses +:figclass: figure +:align: center + +The type of requests incoming from the cache frontend, the number of requests that were serviced by the vector L1 data (vL1D) cache, and the number & type of outgoing requests to the L2 cache. +``` +###### L1D - L2 Transactions +``` {figure} images/vl1d-l2-transactions_panel.png +:alt: L1D - L2 Transactions +:figclass: figure +:align: center + +A more granular look at the types of requests made to the L2 cache. 
+``` +###### L1D Addr Translation +``` {figure} images/vl1d-addr-translation_panel.png +:alt: L1D Addr Translation +:figclass: figure +:align: center + +After a vector memory instruction has been processed/coalesced by the address processing unit of the vector L1 data (vL1D) cache, it must be translated from a virtual to physical address. These metrics provide more details on the L1 Translation Lookaside Buffer (TLB) which handles this process. +``` ##### L2 Cache ###### Speed-of-Light -![Speed-of-Light](images/L2_cache_sol.png) +``` {figure} images/l2-sol_panel.png +:alt: Speed-of-Light +:figclass: figure +:align: center + +Key metrics about the performance of the L2 cache, aggregated over all the L2 channels, as a comparison with the peak achievable values of those metrics. +``` ###### L2 Cache Accesses -![L2 Cache Accesses](images/L2_cache_accesses.png) -###### L2 - EA Transactions -![L2 - EA Transactions](images/L2_ea_transactions.png) -###### L2 - EA Stalls -![L2 - EA Stalls](images/L2_ea_stalls.png) - -##### L2 Cache Per Channel Performance -###### L1-L2 Transactions -![L1-L2 Transactions](images/L1_l2_transactions_per_channel.png) -###### L2-EA Transactions -![L2-EA Transactions](images/L2_ea_transactions_per_channel.png) -###### L2-EA Latencies -![L2-EA Latencies](images/L2_ea_latencies_per_channel.png) -###### L2-EA Stalls -![L2-EA Stalls](images/L2_ea_stalls_per_channel.png) -###### L2-EA Write Stalls -![L2-EA Write Stalls](images/L2_ea_write_stalls_per_channel.png) -###### L2-EA Write Starvation -![L2-EA Write Starvation](images/L2_ea_write_starvation_per_channel.png) \ No newline at end of file +``` {figure} images/l2-accesses_panel.png +:alt: L2 Cache Accesses +:figclass: figure +:align: center + +Incoming requests to the L2 cache from the vector L1 data (vL1D) cache and other clients (e.g., the sL1D and L1I caches). +``` +###### L2 - Fabric Transactions +``` {figure} images/l2-fabric-transactions_panel.png +:alt: L2 - Fabric Transactions +:figclass: figure +:align: center + +More detail on the flow of requests through Infinity Fabric™. +``` +###### L2 - Fabric Interface Stalls +``` {figure} images/l2-fabric-interface-stalls_panel.png +:alt: L2 - Fabric Interface Stalls +:figclass: figure +:align: center + +A breakdown of what types of requests in a kernel caused a stall (e.g., read vs write), and to which locations (e.g., to the accelerator’s local memory, or to remote accelerators/CPUs). +``` + +##### L2 Cache Per Channel +###### Aggregate Stats +``` {figure} images/l2-per-channel-agg-stats_panel.png +:alt: Aggregate Stats +:figclass: figure +:align: center + +L2 Cache per channel performance at a glance. Metrics are aggregated over all available channels. 
+``` diff --git a/src/docs/conf.py b/src/docs/conf.py index af0003fb7..f1f26ff80 100644 --- a/src/docs/conf.py +++ b/src/docs/conf.py @@ -32,13 +32,13 @@ def install(package): # -- Project information ----------------------------------------------------- project = "Omniperf" -copyright = "2022, Audacious Software Group" +copyright = "2023-2024, Audacious Software Group" author = "Audacious Software Group" # The short X.Y version version = repo_version # The full version, including alpha/beta/rc tags -release = "" +release = repo_version # -- General configuration --------------------------------------------------- @@ -52,9 +52,12 @@ def install(package): "myst_parser", ] -myst_heading_anchors = 2 +show_authors = True + +myst_heading_anchors = 4 # enable replacement of (tm) & friends -myst_enable_extensions = ["replacements"] +myst_enable_extensions = ["replacements", "dollarmath"] + # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -112,6 +115,10 @@ def install(package): # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] +latex_elements = { + "sphinxsetup": 'verbatimwrapslines=true, verbatimforcewraps=true', +} + # -- Options for HTMLHelp output --------------------------------------------- @@ -130,7 +137,7 @@ def install(package): # Toc options "collapse_navigation": True, "sticky_navigation": True, - "navigation_depth": 4, + "navigation_depth": 5, "includehidden": True, "titles_only": False, } @@ -162,6 +169,7 @@ def setup(app): app.add_transform(AutoStructify) app.add_config_value("docstring_replacements", {}, True) app.connect("source-read", replaceString) + app.add_css_file("css/custom.css") # function to replace version string througout documentation diff --git a/src/docs/faq.md b/src/docs/faq.md index 6a996cc27..c5450532e 100644 --- a/src/docs/faq.md +++ b/src/docs/faq.md @@ -6,7 +6,7 @@ :maxdepth: 4 ``` -**1. How do I export profiling data I've already generated using Omniperf?** +**1. How do I export profiling data I have already generated using Omniperf?** In order to interact with the Grafana GUI you must sync data with the MongoDB backend. This interaction is done through ***database*** mode. @@ -35,11 +35,23 @@ $ export LANG=C.UTF-8 1. Open MobaXterm 2. In the top ribbon, select `Tunneling` -![Tunnel Button](images/tunnel_demo1.png) +``` {image} images/tunnel_demo1.png +:alt: MobaXterm Tunnel Button +:class: bg-primary +:align: center +``` This pop up will appear -![Pop up](images/tunnel_demo2.png) +``` {image} images/tunnel_demo2.png +:alt: MobaXterm Pop Up +:class: bg-primary +:align: center +``` 3. Press `New SSH tunnel` -![Pop up](images/tunnel_demo3.png) +``` {image} images/tunnel_demo3.png +:alt: MobaXterm Pop Up +:class: bg-primary +:align: center +``` 4. Configure tunnel accordingly Local clients @@ -52,4 +64,4 @@ This pop up will appear SSH Server - SSH server: Name of the server one is connecting to - SSH login: Username to login to the server - - SSH port: 22 \ No newline at end of file + - SSH port: 22 diff --git a/src/docs/getting_started.md b/src/docs/getting_started.md index 80ae888f0..b841fb063 100644 --- a/src/docs/getting_started.md +++ b/src/docs/getting_started.md @@ -10,13 +10,13 @@ 1. **Launch & Profile the target application with the command line profiler** - The command line profiler launches the target application, calls the rocProfiler API, and collects profile results for the specified kernels, dispatches, and/or IP blocks. 
If not specified, Omniperf will default to collecting all available counters for all kernels/dispatches launched by the user's executable. + The command line profiler launches the target application, calls the rocProfiler API via the rocProf binary, and collects profile results for the specified kernels, dispatches, and/or hardware components. If not specified, Omniperf will default to collecting all available counters for all kernels/dispatches launched by the user's executable. To collect the default set of data for all kernels in the target application, launch, e.g.: ```shell $ omniperf profile -n vcopy_data -- ./vcopy 1048576 256 ``` - The app runs, each kernel is launched, and profiling results are generated. By default, results are written to (e.g.,) ./workloads/vcopy_data (configurable via the `-n` argument). To collect all requested profile information, it may be required to replay kernels multiple times. + The app runs, each kernel is launched, and profiling results are generated. By default, results are written to, e.g., ./workloads/vcopy_data (configurable via the `-n` argument). To collect all requested profile information, it may be required to replay kernels multiple times. 2. **Customize data collection** @@ -25,19 +25,20 @@ Some common filters include: - - `-k`/`--kernel` enables filtering kernels by name. `-d`/`--dispatch` enables filtering based on dispatch ID - - `-b`/`--ipblocks` enables collects metrics for only the specified (one or more) IP Blocks. + - `-k`/`--kernel` enables filtering kernels by name. + - `-d`/`--dispatch` enables filtering based on dispatch ID. + - `-b`/`--ipblocks` collects metrics only for the specified (one or more) hardware component blocks. - To view available metrics by IP Block you can use the `--list-metrics` argument to view a list of all available metrics organized by IP Block. + To view available metrics by IP Block, you can use the `--list-metrics` argument: ```shell $ omniperf analyze --list-metrics ``` 3. **Analyze at the command line** - After generating a local output folder (./workloads/\), the command line tool can also be used to quickly interface with profiling results. View different metrics derived from your profiled results and get immediate access all metrics organized by IP block. + After generating a local output folder (./workloads/\), the command line tool can also be used to quickly interface with profiling results. View different metrics derived from your profiled results and get immediate access to all metrics organized by hardware block. - If no kernel, dispatch, or ipblock filters are applied at this stage, analysis will be reflective of the entirety of the profiling data. + If no kernel, dispatch, or hardware block filters are applied at this stage, analysis will be reflective of the entirety of the profiling data. To interact with profiling results from a different session, users just provide the workload path. `-p`/`--path` enables users to analyze existing profiling data in the Omniperf CLI. @@ -55,7 +56,7 @@ ### Modes Modes change the fundamental behavior of the Omniperf command line tool. Depending on which mode is chosen, different command line options become available. -- **Profile**: Target application is launched on the local system utilizing AMD’s [ROC Profiler](https://github.com/ROCm-Developer-Tools/rocprofiler). Depending on the profiling options chosen, selected kernels, dispatches, and/or IP Blocks in the application are profiled and results are stored locally in an output folder (./workloads/\).
+- **Profile**: Target application is launched on the local system using AMD’s [ROC Profiler](https://github.com/ROCm-Developer-Tools/rocprofiler). Depending on the profiling options chosen, selected kernels, dispatches, and/or hardware components in the application are profiled and results are stored locally in an output folder (./workloads/\). ```shell $ omniperf profile --help @@ -65,7 +66,7 @@ Modes change the fundamental behavior of the Omniperf command line tool. Dependi To gererate a lightweight GUI interface users can add the `--gui` flag to their analysis command. - This mode is designed to be a middle ground to the highly detailed Omniperf Grafana GUI and is great for users who want immediate access to an IP Block they’re already familiar with. + This mode is designed to be a middle ground to the highly detailed Omniperf Grafana GUI and is great for users who want immediate access to a hardware component they’re already familiar with. ```shell $ omniperf analyze --help @@ -90,4 +91,4 @@ Standalone roofline analysis | profile | `--name`, `--roof-only`, `-- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Total Fabric Requests + + + + + 32B Read Requests + + + + 64B Read Requests + + + + 32B Write Requests + + + + + + 64B Write Requests + + + + + + Uncached Read Requests + + + x2 + + + + Uncached Write Requests + + + + + + Atomic +Requests + + + + + + HBM Read +Requests + + + + + Remote Read +Requests + + + + + + + + + + + + + + + + + + + HBM Write Requests + + + + Remote Write Requests + + + + diff --git a/src/docs/images/fig_level_counter.png b/src/docs/images/fig_level_counter.png new file mode 100755 index 000000000..fa50539a0 Binary files /dev/null and b/src/docs/images/fig_level_counter.png differ diff --git a/src/docs/images/gcn_compute_unit.png b/src/docs/images/gcn_compute_unit.png new file mode 100644 index 000000000..e6c1f2eb0 Binary files /dev/null and b/src/docs/images/gcn_compute_unit.png differ diff --git a/src/docs/images/instr-cache-accesses_panel.png b/src/docs/images/instr-cache-accesses_panel.png new file mode 100644 index 000000000..926a7805e Binary files /dev/null and b/src/docs/images/instr-cache-accesses_panel.png differ diff --git a/src/docs/images/instr-cache-sol_panel.png b/src/docs/images/instr-cache-sol_panel.png new file mode 100644 index 000000000..64be7178c Binary files /dev/null and b/src/docs/images/instr-cache-sol_panel.png differ diff --git a/src/docs/images/l1perf_model.png b/src/docs/images/l1perf_model.png new file mode 100644 index 000000000..fdabfbb95 Binary files /dev/null and b/src/docs/images/l1perf_model.png differ diff --git a/src/docs/images/l1perf_model.svg b/src/docs/images/l1perf_model.svg new file mode 100644 index 000000000..dd22a7131 --- /dev/null +++ b/src/docs/images/l1perf_model.svg @@ -0,0 +1,584 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + Compute Unit + Cmd/Data + + + + Address Processing Unit + + + Sync + Data Processing Unit + + Virtual To Physical Address Translation + + Tag RAM + + L1 Cache Controller + + CacheRAM + + L2 Memory Interface + Data + + Bus + + L2 Cache + + + + diff --git a/src/docs/images/l2-accesses_panel.png b/src/docs/images/l2-accesses_panel.png new file mode 100644 index 000000000..101cf7753 Binary files /dev/null and b/src/docs/images/l2-accesses_panel.png differ diff --git 
a/src/docs/images/l2-fabric-interface-stalls_panel.png b/src/docs/images/l2-fabric-interface-stalls_panel.png new file mode 100644 index 000000000..b1bd415ca Binary files /dev/null and b/src/docs/images/l2-fabric-interface-stalls_panel.png differ diff --git a/src/docs/images/l2-fabric-transactions_panel.png b/src/docs/images/l2-fabric-transactions_panel.png new file mode 100644 index 000000000..7df5a7809 Binary files /dev/null and b/src/docs/images/l2-fabric-transactions_panel.png differ diff --git a/src/docs/images/l2-per-channel-agg-stats_panel.png b/src/docs/images/l2-per-channel-agg-stats_panel.png new file mode 100644 index 000000000..704d45c69 Binary files /dev/null and b/src/docs/images/l2-per-channel-agg-stats_panel.png differ diff --git a/src/docs/images/l2-sol_panel.png b/src/docs/images/l2-sol_panel.png new file mode 100644 index 000000000..646e608cb Binary files /dev/null and b/src/docs/images/l2-sol_panel.png differ diff --git a/src/docs/images/lds-sol_panel.png b/src/docs/images/lds-sol_panel.png new file mode 100644 index 000000000..c261513aa Binary files /dev/null and b/src/docs/images/lds-sol_panel.png differ diff --git a/src/docs/images/lds-stats_panel.png b/src/docs/images/lds-stats_panel.png new file mode 100644 index 000000000..0d9d419eb Binary files /dev/null and b/src/docs/images/lds-stats_panel.png differ diff --git a/src/docs/images/lds.png b/src/docs/images/lds.png new file mode 100644 index 000000000..f444eaf53 Binary files /dev/null and b/src/docs/images/lds.png differ diff --git a/src/docs/images/lds.svg b/src/docs/images/lds.svg new file mode 100644 index 000000000..c0adb5e91 --- /dev/null +++ b/src/docs/images/lds.svg @@ -0,0 +1,393 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SIMD 0/1 + SIMD 2/3 + + + + + + Conflict Detection + + + + Scheduler + + + + Bank 0 + + + + Bank 1 + + + + Bank 2 + + + + Bank 3 + + + + Bank 31 + + ... 
+ + diff --git a/src/docs/images/ldsbandwidth.png b/src/docs/images/ldsbandwidth.png new file mode 100644 index 000000000..bd74d6249 Binary files /dev/null and b/src/docs/images/ldsbandwidth.png differ diff --git a/src/docs/images/ldsbandwidth.svg b/src/docs/images/ldsbandwidth.svg new file mode 100644 index 000000000..a854f697d --- /dev/null +++ b/src/docs/images/ldsbandwidth.svg @@ -0,0 +1,1579 @@ + 2023-08-21T11:00:20.650499 + image/svg+xml + Matplotlib v3.7.1, https://matplotlib.org/
diff --git a/src/docs/images/ldsconflictrate.png b/src/docs/images/ldsconflictrate.png new file mode 100644 index 000000000..ab057f3cd Binary files /dev/null and b/src/docs/images/ldsconflictrate.png differ diff --git a/src/docs/images/ldsconflictrate.svg b/src/docs/images/ldsconflictrate.svg new file mode 100644 index 000000000..f98e9bc4a --- /dev/null +++ b/src/docs/images/ldsconflictrate.svg @@ -0,0 +1,1050 @@ + 2023-08-21T11:43:04.336525 + image/svg+xml + Matplotlib v3.7.1, https://matplotlib.org/
diff --git a/src/docs/images/ldsconflicts.png b/src/docs/images/ldsconflicts.png new file mode 100644 index 000000000..77c093858 Binary files /dev/null and b/src/docs/images/ldsconflicts.png differ diff --git a/src/docs/images/ldsconflicts.svg b/src/docs/images/ldsconflicts.svg new file mode 100644 index 000000000..f4a2f17d1 --- /dev/null +++ b/src/docs/images/ldsconflicts.svg @@ -0,0 +1,1145 @@ + 2023-08-17T18:14:36.907658 + image/svg+xml + Matplotlib v3.7.1, https://matplotlib.org/
diff --git a/src/docs/images/memory-chart_panel.png b/src/docs/images/memory-chart_panel.png new file mode 100644 index 000000000..1091a5032 Binary files /dev/null and b/src/docs/images/memory-chart_panel.png differ diff --git a/src/docs/images/nosplit.png b/src/docs/images/nosplit.png new file mode 100644 index 000000000..a8e5f0164 Binary files /dev/null and b/src/docs/images/nosplit.png differ diff --git a/src/docs/images/nosplit.svg b/src/docs/images/nosplit.svg new file mode 100644 index 000000000..d0d9606be --- /dev/null +++ b/src/docs/images/nosplit.svg @@ -0,0 +1,71 @@
diff --git a/src/docs/images/roofline_panel.png b/src/docs/images/roofline_panel.png new file mode 100644 index 000000000..47ee9bddb Binary files /dev/null and b/src/docs/images/roofline_panel.png differ diff --git a/src/docs/images/selayout.png b/src/docs/images/selayout.png new file mode 100644 index 000000000..73aa2b49d Binary files /dev/null and b/src/docs/images/selayout.png differ diff --git a/src/docs/images/sl1d-cache-accesses_panel.png b/src/docs/images/sl1d-cache-accesses_panel.png new file mode 100644 index 000000000..3605cce8a Binary files /dev/null and b/src/docs/images/sl1d-cache-accesses_panel.png differ diff --git a/src/docs/images/sl1d-l12-interface_panel.png b/src/docs/images/sl1d-l12-interface_panel.png new file mode 100644 index 000000000..5c3480ac9 Binary files /dev/null and b/src/docs/images/sl1d-l12-interface_panel.png differ diff --git a/src/docs/images/sl1d-sol_panel.png b/src/docs/images/sl1d-sol_panel.png new file mode 100644 index 000000000..92fa5a1a4 Binary files /dev/null and b/src/docs/images/sl1d-sol_panel.png differ diff --git a/src/docs/images/sol_panel.png b/src/docs/images/sol_panel.png new file mode 100644 index 000000000..f456500e0 Binary files /dev/null and b/src/docs/images/sol_panel.png differ diff --git a/src/docs/images/spi-resource-allocation_panel.png
b/src/docs/images/spi-resource-allocation_panel.png new file mode 100644 index 000000000..bee869ad1 Binary files /dev/null and b/src/docs/images/spi-resource-allocation_panel.png differ diff --git a/src/docs/images/spi-stats_panel.png b/src/docs/images/spi-stats_panel.png new file mode 100644 index 000000000..19c7ad364 Binary files /dev/null and b/src/docs/images/spi-stats_panel.png differ diff --git a/src/docs/images/split.png b/src/docs/images/split.png new file mode 100644 index 000000000..cca71eb2a Binary files /dev/null and b/src/docs/images/split.png differ diff --git a/src/docs/images/split.svg b/src/docs/images/split.svg new file mode 100644 index 000000000..b033a9e11 --- /dev/null +++ b/src/docs/images/split.svg @@ -0,0 +1,64 @@ + + + + + + + + + + + diff --git a/src/docs/images/system-info_panel.png b/src/docs/images/system-info_panel.png new file mode 100644 index 000000000..5a5fa0118 Binary files /dev/null and b/src/docs/images/system-info_panel.png differ diff --git a/src/docs/images/ta_panel.png b/src/docs/images/ta_panel.png new file mode 100644 index 000000000..2f08f9a6b Binary files /dev/null and b/src/docs/images/ta_panel.png differ diff --git a/src/docs/images/td_panel.png b/src/docs/images/td_panel.png new file mode 100644 index 000000000..819407515 Binary files /dev/null and b/src/docs/images/td_panel.png differ diff --git a/src/docs/images/top-stat_panel.png b/src/docs/images/top-stat_panel.png new file mode 100644 index 000000000..5e3dddca2 Binary files /dev/null and b/src/docs/images/top-stat_panel.png differ diff --git a/src/docs/images/uncached.png b/src/docs/images/uncached.png new file mode 100644 index 000000000..f770a1b29 Binary files /dev/null and b/src/docs/images/uncached.png differ diff --git a/src/docs/images/uncached.svg b/src/docs/images/uncached.svg new file mode 100644 index 000000000..53affd4fc --- /dev/null +++ b/src/docs/images/uncached.svg @@ -0,0 +1,125 @@ + + + + + + + + + + + + + + + + + + + + + + x2 + + diff --git a/src/docs/images/vl1d-addr-translation_panel.png b/src/docs/images/vl1d-addr-translation_panel.png new file mode 100644 index 000000000..0fb4aaf07 Binary files /dev/null and b/src/docs/images/vl1d-addr-translation_panel.png differ diff --git a/src/docs/images/vl1d-cache-accesses_panel.png b/src/docs/images/vl1d-cache-accesses_panel.png new file mode 100644 index 000000000..5259b2214 Binary files /dev/null and b/src/docs/images/vl1d-cache-accesses_panel.png differ diff --git a/src/docs/images/vl1d-cache-stalls_panel.png b/src/docs/images/vl1d-cache-stalls_panel.png new file mode 100644 index 000000000..61e09c915 Binary files /dev/null and b/src/docs/images/vl1d-cache-stalls_panel.png differ diff --git a/src/docs/images/vl1d-l2-transactions_panel.png b/src/docs/images/vl1d-l2-transactions_panel.png new file mode 100644 index 000000000..51875e516 Binary files /dev/null and b/src/docs/images/vl1d-l2-transactions_panel.png differ diff --git a/src/docs/images/vl1d-sol_panel.png b/src/docs/images/vl1d-sol_panel.png new file mode 100644 index 000000000..5c2485d0d Binary files /dev/null and b/src/docs/images/vl1d-sol_panel.png differ diff --git a/src/docs/images/wavefront-launch-stats_panel.png b/src/docs/images/wavefront-launch-stats_panel.png new file mode 100644 index 000000000..38e4517f3 Binary files /dev/null and b/src/docs/images/wavefront-launch-stats_panel.png differ diff --git a/src/docs/images/wavefront-runtime-stats_panel.png b/src/docs/images/wavefront-runtime-stats_panel.png new file mode 100644 index 000000000..517d461d3 Binary 
files /dev/null and b/src/docs/images/wavefront-runtime-stats_panel.png differ diff --git a/src/docs/index.md b/src/docs/index.md index 931718107..4f3f7c107 100644 --- a/src/docs/index.md +++ b/src/docs/index.md @@ -12,5 +12,6 @@ getting_started profiling analysis + performance_model faq ``` diff --git a/src/docs/installation.md b/src/docs/installation.md index af8c21bac..caf3e5cd2 100644 --- a/src/docs/installation.md +++ b/src/docs/installation.md @@ -17,7 +17,7 @@ Omniperf is broken into two installation components: - Mongo DB backend + Grafana instance - Packaged in a Docker container for easy setup -Determine what you need to install based on how you'd like to interact with Omniperf. See the decision tree below to help determine what installation is right for you. +Determine what you need to install based on how you would like to interact with Omniperf. See the decision tree below to help determine what installation is right for you. ![Omniperf Installtion Decision Tree](images/install_decision_tree.png) @@ -55,14 +55,14 @@ available from the of the Omniperf development site. From there, untar and descend into the top-level directory as follows: -```shell +```shell-session $ tar xfz omniperf-v{__VERSION__}.tar.gz $ cd omniperf-v{__VERSION__} ``` Next, install Python dependencies and complete the Omniperf configuration/install process as follows: -```shell +```shell-session # define top-level install path $ export INSTALL_DIR= @@ -87,7 +87,7 @@ do not have write access to the chosen install path. After completing these steps, a successful top-level installation directory looks as follows: -```shell +```shell-session $ ls $INSTALL_DIR modulefiles {__VERSION__} python-libs ``` @@ -102,7 +102,7 @@ follows: -```shell +```shell-session $ module use $INSTALL_DIR/modulefiles $ module load omniperf $ which omniperf @@ -125,14 +125,14 @@ To use Omniperf without the companion modulefile, update your `PATH` settings to enable access to the command-line binary. If you installed Python dependencies in a shared location, update your `PYTHONPATH` config as well: -```shell +```shell-session export PATH=$INSTALL_DIR/{__VERSION__}/bin:$PATH export PYTHONPATH=$INSTALL_DIR/python-libs ``` ### rocProf -Omniperf relies on a rocprof binary during the profiling +Omniperf relies on a rocProf binary during the profiling process. Normally the path to this binary will be detected automatically, but it can also be overridden via the setting the optional `ROCPROF` environment variable to the path of the binary the user @@ -162,9 +162,9 @@ Omniperf server-side requires the following basic software dependencies prior to The recommended process for enabling the server-side of Omniperf is to use the provided Docker file to build the Grafana and MongoDB instance. -Once you've decided which machine you'd like to use to host the Grafana and MongoDB instance, please follow the set up instructions below. +Once you have decided which machine you would like to use to host the Grafana and MongoDB instance, please follow the set up instructions below. -### 1) Install MongoDB Utils +### Install MongoDB Utils Omniperf uses [mongoimport](https://www.mongodb.com/docs/database-tools/mongoimport/) to upload data to Grafana's backend database. 
Install for Ubuntu 20.04 is as follows: ```bash @@ -173,7 +173,7 @@ $ sudo apt install ./mongodb-database-tools-ubuntu2004-x86_64-100.6.1.deb ``` > Installation instructions for alternative distributions can be found [here](https://www.mongodb.com/download-center/database-tools/releases/archive) -### 2) Persistent Storage +### Persistent Storage The user will also bind MongoDB to a directory on the host OS to create a local backup in case of a crash or reset. In the Docker world, this is known as "creating a persistent volume": ```bash $ sudo docker volume create --driver local --opt type=none --opt device=/usr/loc @@ -184,24 +184,24 @@ $ sudo docker volume create --driver local --opt type=none --opt device=/usr/local/persist/mongodb --opt o=bind grafana-mongo-db ``` -### 3) Build and Launch +### Build and Launch -We're now ready to build our Docker file. Navigate to your Omniperf install directory to begin. +We are now ready to build our Docker file. Navigate to your Omniperf install directory to begin. ```bash $ sudo docker-compose build $ sudo docker-compose up -d ``` > Note that TCP ports for Grafana (4000) and MongoDB (27017) in the docker container are mapped to 14000 and 27018, respectively, on the host side. -### 4) Setup Grafana Instance -Once you've launced your docker container you should be able to reach Grafana at **http://\:14000**. The default login credentials for the first-time Grafana setup are: +### Setup Grafana Instance +Once you have launched your docker container you should be able to reach Grafana at **http://\:14000**. The default login credentials for the first-time Grafana setup are: - Username: **admin** - Password: **admin** ![Grafana Welcome Page](images/grafana_welcome.png) -MongoDB Datasource Configuration +### MongoDB Datasource Configuration The MongoDB Datasource must be configured prior to the first-time use. Navigate to Grafana's Configuration page (shown below) to add the **Omniperf Data** connection. @@ -219,7 +219,7 @@ After properly configuring these fields click **Save & Test** (as shown below) t ![Datasource Settings](images/datasource_settings.jpg) -Omniperf Dashboard Import +### Omniperf Dashboard Import From *Create* → *Import*, (as shown below) upload the dashboard file, `/dashboards/Omniperf_v{__VERSION__}_pub.json`, from the Omniperf tarball. @@ -227,17 +227,17 @@ Edit both the Dashboard Name and the Unique Identifier (UID) to uniquely identif ![Import Dashboard](images/import_dashboard.png) -Using your dashboard +### Using your dashboard -Once you've imported a dashboard you're ready to begin! Start by browsing availible dashboards and selecting the dashboard you've just imported. +Once you have imported a dashboard you are ready to begin! Start by browsing available dashboards and selecting the dashboard you have just imported. ![Opening your dashboard](images/opening_dashboard.png) -Remeber, you'll need to upload workload data to the DB backend before analyzing in your Grafana interface. We provide a detailed example of this in our [Analysis section](./analysis.md#grafana-gui-import). +Remember, you will need to upload workload data to the DB backend before analyzing in your Grafana interface. We provide a detailed example of this in our [Analysis section](./analysis.md#grafana-gui-import). After a workload has been successfully uploaded, you should be able to select it from the workload dropdown located at the top of your Grafana dashboard.
![Selecting Grafana workload](images/grafana_workload_selection.png) -For more information on how to use the Grafana interface for anlysis please see the [Grafana section](./analysis.md#grafana-based-gui) in the Analyze Mode tab. +For more information on how to use the Grafana interface for analysis please see the [Grafana section](./analysis.md#grafana-based-gui) in the Analyze Mode tab. diff --git a/src/docs/introduction.md b/src/docs/introduction.md index f0e3864d1..6e595b926 100644 --- a/src/docs/introduction.md +++ b/src/docs/introduction.md @@ -6,15 +6,19 @@ :maxdepth: 4 ``` +This documentation was created to provide a detailed breakdown of all facets of Omniperf. In addition to a full deployment guide with installation instructions, we also explain the design of the tool and each of its components. If you are new to Omniperf, these chapters can be followed in order to gradually acquaint you with the tool and progressively introduce its more advanced features. + +This project is proudly open source, and we welcome all feedback! For more details on how to contribute, please see our Contribution Guide. + [Browse Omniperf source code on Github](https://github.com/AMDResearch/omniperf) -## Scope +## What is Omniperf -MI Performance Profiler ([Omniperf](https://github.com/AMDResearch/omniperf)) is a system performance profiling tool for Machine Learning/HPC workloads running on AMD Instinct (tm) Accelerators. It is currently built on top of the [rocProfiler](https://rocm.docs.amd.com/projects/rocprofiler/en/latest/rocprof.html) to monitor hardware performance counters. The Omniperf tool primarily targets accelerators in the MI100 and MI200 families. Development is in progress to support MI300 and Radeon (tm) RDNA (tm) GPUs. +Omniperf is a kernel level profiling tool for Machine Learning/HPC workloads running on AMD Instinct (tm) MI accelerators. AMD's Instinct (tm) MI accelerators are Data Center GPUs designed for compute and with some graphics functions disabled or removed. Omniperf is currently built on top of [rocProf](https://rocm.docs.amd.com/projects/rocprofiler/en/latest/rocprof.html) to monitor hardware performance counters. The Omniperf tool primarily targets accelerators in the MI100 and MI200 families. Development is in progress to support AMD Instinct (tm) MI300 and Radeon (tm) RDNA (tm) GPUs. ## Features -The Omniperf tool performs system profiling based on all available hardware counters for the target accelerator. It provides high level performance analysis features including System Speed-of-Light, IP block Speed-of-Light, Memory Chart Analysis, Roofline Analysis, Baseline Comparisons, and more... +The Omniperf tool performs profiling based on all available hardware counters for the target accelerator. It provides high level performance analysis features including System Speed-of-Light, Hardware block level Speed-of-Light, Memory Chart Analysis, Roofline Analysis, Baseline Comparisons, and more... Both command line analysis and GUI analysis are supported. 
@@ -32,25 +36,25 @@ Detailed Feature List: - System Speed-of-Light Panel - Kernel Statistic Panel - Memory Chart Analysis Panel -- Roofline Analysis Panel (*Supported on MI200 only, SLES 15 SP3 or RHEL8*) +- Roofline Analysis Panel (*Supported on MI200 only, Ubuntu 20.04, SLES 15 SP3 or RHEL8*) - Command Processor (CP) Panel -- Shader Processing Input (SPI) Panel +- Workgroup Manager (SPI) Panel - Wavefront Launch Panel - Compute Unit - Instruction Mix Panel - Compute Unit - Pipeline Panel - Local Data Share (LDS) Panel - Instruction Cache Panel - Scalar L1D Cache Panel -- Texture Addresser and Data Panel +- L1 Address Processing Unit, a.k.a. Texture Addresser (TA) / L1 Backend Data Processing Unit, a.k.a. Texture Data (TD) panel(s) - Vector L1D Cache Panel - L2 Cache Panel - L2 Cache (per-Channel) Panel -## Compatible SOCs +## Compatible SoCs | Platform | Status | | :------- | :------------- | -| Vega 20 (MI-50/60) | No | +| Vega 20 (MI50/60) | No | | MI100 | Supported | | MI200 | Supported | | MI300 | In development | diff --git a/src/docs/performance_model.md b/src/docs/performance_model.md new file mode 100644 index 000000000..59b685144 --- /dev/null +++ b/src/docs/performance_model.md @@ -0,0 +1,4408 @@ +# AMD Instinct(tm) MI Series Accelerator Performance Model + +```eval_rst +.. sectionauthor:: Nicholas Curtis +``` + +Omniperf makes available an extensive list of metrics to better understand achieved application performance on AMD Instinct(tm) MI accelerators including Graphics Core Next (GCN) GPUs such as the AMD Instinct MI50, CDNA(tm) accelerators such as the MI100, and CDNA(tm) 2 accelerators such as MI250X/250/210. + +To best utilize this profiling data, it is vital to understand the role of various hardware blocks of AMD Instinct accelerators. This section aims to describe each hardware block on the accelerator as interacted with by a software developer, and give a deeper understanding of the metrics reported therein. Refer to [Profiling with Omniperf by Example](profiling-with-omniperf) for more practical examples and detail on how to use Omniperf to optimize your code. + +(2xxnote)= +```{note} +In this document, we use `MI2XX` to refer to any of the AMD Instinct(tm) MI250X, MI250, and MI210 CDNA2 accelerators interchangeably for situations where the exact product in question is not relevant. +For more details on the differences between these accelerators, we refer the reader to the [MI250X](https://www.amd.com/en/products/server-accelerators/instinct-mi250x), [MI250](https://www.amd.com/en/products/server-accelerators/instinct-mi250) and [MI210](https://www.amd.com/en/products/server-accelerators/amd-instinct-mi210) product pages. +``` + + +(CU)= +## Compute Unit (CU) + +The Compute Unit (CU) is responsible for executing a user's kernels on AMD's CDNA(tm) accelerators. All [wavefronts](wavefront) of a [workgroup](workgroup) are scheduled on the same CU. + +![GCN Compute Unit](images/gcn_compute_unit.png) + +The CU consists of several independent pipelines / functional units: + +- The vector arithmetic logic unit (VALU) is composed of multiple Single Instruction Multiple Data (SIMD) vector processors, Vector General Purpose Registers (VGPRs) and instruction buffers. The VALU is responsible for executing much of the computational work on CDNA accelerators, including (but not limited to) floating-point operations (FLOPs), integer operations (IOPs), etc. 
+- The vector memory (VMEM) unit is responsible for issuing loads, stores and atomic operations that interact with the memory system. +- The Scalar Arithmetic Logic Unit (SALU) is shared by all threads in a [wavefront](wavefront), and is responsible for executing instructions that are known to be uniform across the wavefront at compile-time. The SALU has a memory unit (SMEM) for interacting with memory, but it cannot issue separately from the SALU. +- The Local Data Share (LDS) is an on-CU software-managed scratchpad memory that can be used to efficiently share data between all threads in a [workgroup](workgroup). +- The scheduler is responsible for issuing and decoding instructions for all the [wavefronts](wavefront) on the compute unit. +- The vector L1 data cache (vL1D) is the first level cache local to the compute unit. On current CDNA accelerators, the vL1D is write-through. The vL1D caches from multiple compute units are kept coherent with one another through software instructions. +- CDNA accelerators --- i.e., the MI100 and newer --- contain specialized matrix-multiplication accelerator pipelines known as the [Matrix Fused Multiply-Add (MFMA)](mfma). + +For a more thorough description of a compute unit on a CDNA accelerator, see [An introduction to AMD GPU +Programming with HIP](https://www.olcf.ornl.gov/wp-content/uploads/2019/09/AMD_GPU_HIP_training_20190906.pdf), specifically slides 22-28, and [Layla Mah's: The AMD GCN Architecture - A Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah), slide 27. + +The [Pipeline Descriptions section](ERD) details the various execution pipelines (VALU, SALU, LDS, Scheduler, etc.). +The metrics presented by Omniperf for these pipelines are described in the [Pipeline Metrics section](ERM). +Finally, the [vL1D](vL1D) cache and [LDS](LDS) will be described in their own sections. + + +(ERD)= +### Pipeline Descriptions + +(valu)= +#### Vector Arithmetic Logic Unit (VALU) + +The vector arithmetic logic unit (VALU) executes vector instructions over an entire wavefront, each [work-item](Workitem) (or vector-lane) potentially operating on distinct data. +The VALU of a CDNA accelerator or GCN GPU typically consists of: + +- four 16-wide SIMD processors (see [An introduction to AMD GPU +Programming with HIP](https://www.olcf.ornl.gov/wp-content/uploads/2019/09/AMD_GPU_HIP_training_20190906.pdf) for more details) +- four 64 or 128 KiB VGPR files (yielding a total of 256-512 KiB per CU), see [AGPRs](agprs) for more detail. +- An instruction buffer (per-SIMD) that contains execution slots for up to 8 wavefronts (for 32 total wavefront slots on each CU). +- A vector memory (VMEM) unit which transfers data between VGPRs and memory; each work-item supplies its own memory address and supplies or receives unique data. +- CDNA accelerators, such as the MI100 and [MI2XX](2xxnote), contain additional [Matrix Fused Multiply-Add (MFMA) units](https://gpuopen.com/learn/amd-lab-notes/amd-lab-notes-matrix-cores-readme/). + +In order to support branching / conditionals, each wavefront in the VALU has a distinct execution mask which determines which work-items in the wavefront are active for the currently executing instruction. +When executing a VALU instruction, inactive work-items (according to the current execution mask of the wavefront) do not execute the instruction and are treated as no-ops.
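As a concrete illustration of the execution mask, consider the following HIP sketch (the kernel and every name in it are hypothetical and not part of Omniperf or its samples). When the lanes of a wavefront diverge at the `if`, the instructions for both sides of the branch are issued to the VALU for the whole wavefront under complementary execution masks, and the lanes that are inactive for a given side are treated as no-ops:

```c++
#include <hip/hip_runtime.h>

// Hypothetical example: even and odd lanes of every 64-wide wavefront take
// opposite sides of the branch, so the wavefront diverges and the VALU issues
// both sides, each under an execution mask that disables the inactive lanes.
__global__ void divergent_update(const float* x, float* y, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n) return;
  if (idx % 2 == 0) {
    y[idx] = 2.0f * x[idx] + y[idx];  // issued with the even lanes active
  } else {
    y[idx] = -x[idx];                 // issued with the odd lanes active
  }
}

int main() {
  const int n = 1 << 20;
  float *x = nullptr, *y = nullptr;
  (void)hipMalloc(reinterpret_cast<void**>(&x), n * sizeof(float));
  (void)hipMalloc(reinterpret_cast<void**>(&y), n * sizeof(float));
  (void)hipMemset(x, 0, n * sizeof(float));
  (void)hipMemset(y, 0, n * sizeof(float));
  divergent_update<<<(n + 255) / 256, 256>>>(x, y, n);
  (void)hipDeviceSynchronize();
  (void)hipFree(x);
  (void)hipFree(y);
  return 0;
}
```

Because the instruction-mix and operation counters described later count instructions issued rather than lanes executed, a divergent wavefront such as this one contributes the instructions of both branch paths, even though each work-item produces only one of the two results.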
+ +```{note} +On GCN GPUs and the CDNA MI100 accelerator, there are slots for up to 10 wavefronts in the instruction buffer, but generally occupancy is limited by other factors to 32 waves per [Compute Unit](CU). +On the CDNA2 [MI2XX](2xxnote) series accelerators, there are only 8 waveslots per-SIMD. +``` + +(salu)= +#### Scalar Arithmetic Logic Unit (SALU) + +The scalar arithmetic logic unit (SALU) executes instructions that are shared between all work-items in a wavefront. This includes control-flow -- such as if/else conditionals, branches and looping -- pointer arithmetic, loading common values, etc. +The SALU consists of: + +- a scalar processor capable of various arithmetic, conditional, and comparison (etc.) operations. See, e.g., [Chapter 5. Scalar ALU Operations](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf) of the CDNA2 Instruction Set Architecture (ISA) Guide for more detail. +- a 12.5 KiB Scalar General Purpose Register (SGPR) file +- a scalar memory (SMEM) unit which transfers data between SGPRs and memory + +Data loaded by the SMEM can be cached in the [scalar L1 data cache](sL1D), and is typically only used for read-only, uniform accesses such as kernel arguments, or HIP's `__constant__` memory. + +(lds)= +#### Local Data Share (LDS) + +The local data share (LDS, a.k.a., "shared memory") is fast on-CU scratchpad that can be explicitly managed by software to effectively share data and to coordinate between wavefronts in a workgroup. + +```{figure} images/lds.* +:scale: 150 % +:alt: Performance model of the Local Data Share (LDS) on AMD Instinct(tm) MI accelerators. +:align: center + +Performance model of the Local Data Share (LDS) on AMD Instinct(tm) MI accelerators. +``` + +Above is Omniperf's performance model of the LDS on CDNA accelerators (adapted from [GCN Architecture, by Mike Mantor](https://old.hotchips.org/wp-content/uploads/hc_archives/hc24/HC24-3-ManyCore/HC24.28.315-AMD.GCN.mantor_v1.pdf), slide 20). +The SIMDs in the [VALU](valu) are connected to the LDS in pairs (see above). +Only one SIMD per pair may issue an LDS instruction at a time, but both pairs may issue concurrently. + +On CDNA accelerators, the LDS contains 32 banks and each bank is 4B wide. +The LDS is designed such that each bank can be read from/written to/atomically updated every cycle, for a total throughput of 128B/clock ([GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah), slide 40). + +On each of the two ports to the SIMDs, 64B can be sent in each direction per cycle. So, a single wavefront, coming from one of the 2 SIMDs in a pair, can only get back 64B/cycle (16 lanes per cycle). The input port is shared between data and address and this can affect achieved bandwidth for different data sizes. For example, a 64-wide store where each lane is sending a 4B value takes 8 cycles (50% peak bandwidth) while a 64-wide store where each lane is sending a 16B value takes 20 cycles (80% peak bandwidth). + +In addition, the LDS contains conflict-resolution hardware to detect and handle bank conflicts. +A bank conflict occurs when two (or more) work-items in a wavefront want to read, write, or atomically update different addresses that map to the same bank in the same cycle. +In this case, the conflict detection hardware will determine a new schedule such that the access is split into multiple cycles with no conflicts in any single cycle. 
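To make the bank-conflict discussion above concrete, the hedged HIP sketch below (the kernel name, sizes, and access patterns are illustrative assumptions, not Omniperf code) contrasts an LDS access pattern that funnels every lane of a wavefront into the same 4B-wide bank with a unit-stride pattern that spreads consecutive lanes across the 32 banks:

```c++
#include <hip/hip_runtime.h>

// Hypothetical illustration: with 32 LDS banks of 4B each, a 4B element lands
// in bank (element_index % 32).
__global__ void lds_bank_conflicts(float* out) {
  __shared__ float strided[64 * 32];  // 8 KiB of LDS
  __shared__ float linear[64];        // 256 B of LDS
  const int tid = threadIdx.x;

  // Conflict-prone: lane i writes element i*32, and (i*32) % 32 == 0 for every
  // lane, so the whole wavefront targets bank 0 at distinct addresses and the
  // conflict-resolution hardware serializes the access over multiple cycles.
  strided[tid * 32] = static_cast<float>(tid);

  // Conflict-free: consecutive lanes write consecutive elements, which map to
  // consecutive banks, so no bank is hit more than once in the same cycle.
  linear[tid] = static_cast<float>(tid);

  __syncthreads();
  out[tid] = strided[tid * 32] + linear[tid];
}

int main() {
  float* out = nullptr;
  (void)hipMalloc(reinterpret_cast<void**>(&out), 64 * sizeof(float));
  lds_bank_conflicts<<<1, 64>>>(out);
  (void)hipDeviceSynchronize();
  (void)hipFree(out);
  return 0;
}
```

When a 2D tile in LDS is accessed down a column, padding each row by one extra element is a common way to shift consecutive rows into different banks and avoid this kind of conflict.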
+ +When multiple work-items want to read from the same address within a bank, the result can be efficiently broadcasted ([GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah), slide 41). +Multiple work-items writing to the same address within a bank typically results undefined behavior in HIP and other languages, as the LDS will write the value from the last work-item as determined by the hardware scheduler ([GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah), slide 41). This behavior may be useful in the very specific case of storing a uniform value. + +Relatedly, an address conflict is defined as occurring when two (or more) work-items in a wavefront want to atomically update the same address on the same cycle. +As in a bank-conflict, this may cause additional cycles of work for the LDS operation to complete. + +(branch)= +#### Branch + +The branch unit is responsible for executing jumps and branches to execute control-flow operations. +Note that Branch operations are not used for execution mask updates, but only for “whole wavefront” control-flow changes. + +(scheduler)= +#### Scheduler + +The scheduler is responsible for arbitration and issue of instructions for all the wavefronts currently executing on the CU. On every clock cycle, the scheduler: + +- considers waves from one of the SIMD units for execution, selected in a round-robin fashion between the SIMDs in the [compute unit](CU) +- issues up to one instruction per wavefront on the selected SIMD +- issues up to one instruction per each of the instruction categories among the waves on the selected SIMD: + - [VALU](valu) + - [VMEM](valu) operations + - [SALU](salu) / SMEM operations + - [LDS](lds) + - [Branch](branch) operations + +This gives a maximum of five issued Instructions Per Cycle (IPC), per-SIMD, per-CU ([AMD GPU HIP Training](https://www.olcf.ornl.gov/wp-content/uploads/2019/09/AMD_GPU_HIP_training_20190906.pdf), [GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah)). + +On CDNA accelerators with [MFMA](mfma) instructions, these are issued via the [VALU](valu). Some of them will execute on a separate functional unit and typically allow other [VALU](valu) operations to execute in their shadow (see the [MFMA](mfma) section for more detail). + +```{note} +The IPC model used by Omniperf omits the following two complications for clarity. +First, CDNA accelerators contain other execution units on the CU that are unused for compute applications. +Second, so-called "internal" instructions (see [Layla Mah's GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah), slide 29) are not issued to a functional unit, and can technically cause the maximum IPC to _exceed_ 5 instructions per-cycle in special (largely unrealistic) cases. +The latter issue is discussed in more detail in our ['internal' IPC](Internal_ipc) example. +``` + +(mfma)= +#### Matrix Fused Multiply-Add (MFMA) + +CDNA accelerators, such as the MI100 and [MI2XX](2xxnote), contain specialized hardware to accelerate matrix-matrix multiplications, also known as Matrix Fused Multiply-Add (MFMA) operations. +The exact operation types and supported formats may vary by accelerator. 
+The reader is referred to the [AMD matrix cores](https://gpuopen.com/learn/amd-lab-notes/amd-lab-notes-matrix-cores-readme/) blog post on GPUOpen for a general discussion of these hardware units. +In addition, to explore the available MFMA instructions in-depth on various AMD accelerators (including the CDNA line), we recommend the [AMD Matrix Instruction Calculator](https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator). + +```{code-block} shell-session +:name: matrix_calc_ex +:caption: Partial snapshot of the AMD Matrix Instruction Calculator Tool + +$ ./matrix_calculator.py --architecture cdna2 --instruction v_mfma_f32_4x4x1f32 --detail-instruction +Architecture: CDNA2 +Instruction: V_MFMA_F32_4X4X1F32 + Encoding: VOP3P-MAI + VOP3P Opcode: 0x42 + VOP3P-MAI Opcode: 0x2 + Matrix Dimensions: + M: 4 + N: 4 + K: 1 + blocks: 16 + Execution statistics: + FLOPs: 512 + Execution cycles: 8 + FLOPs/CU/cycle: 256 + Can co-execute with VALU: True + VALU co-execution cycles possible: 4 + Register usage: + GPRs required for A: 1 + GPRs required for B: 1 + GPRs required for C: 4 + GPRs required for D: 4 + GPR alignment requirement: 8 bytes +``` + +For the purposes of Omniperf, the MFMA unit is typically treated as a separate pipeline from the [VALU](valu), as other VALU instructions (along with other execution pipelines such as the SALU) can be issued during a portion of the total duration of an MFMA operation. + +```{note} +The exact details of VALU and MFMA operation co-execution vary by instruction, and can be explored in more detail via the: + - 'Can co-execute with VALU' + - 'VALU co-execution cycles possible' + +fields in the [AMD Matrix Instruction Calculator](https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator#example-of-querying-instruction-information)'s detailed instruction information. +``` + +#### Non-pipeline resources + +In this section, we describe a few resources that are not standalone pipelines but are important for understanding performance optimization on CDNA accelerators. + +(barrier)= +##### Barrier + +Barriers are resources on the compute-unit of a CDNA accelerator that are used to implement synchronization primitives (e.g., HIP's `__syncthreads`). +Barriers are allocated to any workgroup that consists of more than a single wavefront. + +(agprs)= +##### Accumulation vector General-Purpose Registers (AGPRs) + +Accumulation vector General-Purpose Registers, or AGPRs, are special resources that are accessible to a subset of instructions focused on [MFMA](mfma) operations. +These registers allow the [MFMA](mfma) unit to access more than the normal maximum of 256 [architected Vector General-Purpose Registers (i.e., VGPRs)](valu) by having up to 256 in the architected space and up to 256 in the accumulation space. +Traditional VALU instructions can only use VGPRs in the architected space, and data can be moved to/from VGPRs↔AGPRs using specialized instructions (`v_accvgpr_*`). +These data movement instructions may be used by the compiler to implement lower-cost register-spill/fills on architectures with AGPRs. + +AGPRs are not available on all AMD Instinct(tm) accelerators. +GCN GPUs, such as the AMD Instinct(tm) MI50 had a 256 KiB VGPR file. +The AMD Instinct(tm) MI100 (CDNA) has a 2x256 KiB register file, where one half is available as general-purpose VGPRs, and the other half is for matrix math accumulation VGPRs (AGPRs). 
+The AMD Instinct(tm) [MI2XX](2xxnote) (CDNA2) has a 512 KiB VGPR file per CU, where each wave can dynamically request up to 256 KiB of VGPRs and an additional 256 KiB of AGPRs. +For more detail, the reader is referred to the [following comment](https://github.com/RadeonOpenCompute/ROCm/issues/1689#issuecomment-1553751913). + +(ERM)= +### Pipeline Metrics + +In this section, we describe the metrics available in Omniperf to analyze the pipelines discussed in the [previous section](ERD). + +#### Wavefront + +(Wavefront_launch_stats)= +##### Wavefront Launch Stats + +The wavefront launch stats panel gives general information about the kernel launch: + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Grid Size + - The total number of work-items (a.k.a "threads") launched as a part of the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied by the total workgroup (a.k.a "block") size. + - [Work-items](Workitem) +* - Workgroup Size + - The total number of work-items (a.k.a "threads") in each workgroup (a.k.a "block") launched as part of the kernel dispatch. In HIP, this is equivalent to the total block size. + - [Work-items](Workitem) +* - Total Wavefronts + - The total number of wavefronts launched as part of the kernel dispatch. On AMD Instinct(tm) CDNA accelerators and GCN GPUs, the wavefront size is always 64 work-items. Thus, the total number of wavefronts should be equivalent to the ceiling of Grid Size divided by 64. + - [Wavefronts](Wavefront) +* - Saved Wavefronts + - The total number of wavefronts saved at a context-save, see [cwsr_enable](https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr). + - [Wavefronts](Wavefront) +* - Restored Wavefronts + - The total number of wavefronts restored from a context-save, see [cwsr_enable](https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr). + - [Wavefronts](Wavefront) +* - VGPRs + - The number of architected vector general-purpose registers allocated for the kernel, see [VALU](valu). Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. + - [VGPRs](valu) +* - AGPRs + - The number of accumulation vector general-purpose registers allocated for the kernel, see [AGPRs](agprs). Note: this may not exactly match the number of AGPRs requested by the compiler due to allocation granularity. + - [AGPRs](agprs) +* - SGPRs + - The number of scalar general-purpose registers allocated for the kernel, see [SALU](salu). Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. + - [SGPRs](salu) +* - LDS Allocation + - The number of bytes of [LDS](lds) memory (a.k.a., "Shared" memory) allocated for this kernel. Note: This may also be larger than what was requested at compile-time due to both allocation granularity and dynamic per-dispatch LDS allocations. + - Bytes per [workgroup](workgroup) +* - Scratch Allocation + - The number of bytes of [scratch-memory](Mspace) requested _per_ work-item for this kernel. Scratch memory is used for stack memory on the accelerator, as well as for register spills/restores. 
+  - Bytes per [work-item](workitem) +``` + +(Wavefront_runtime_stats)= +##### Wavefront Runtime Stats + +The wavefront runtime statistics panel gives a high-level overview of the execution of wavefronts in a kernel: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - [Kernel Time](KernelTime) + - The total duration of the executed kernel. Note: this should not be directly compared to the wavefront cycles / timings below. + - Nanoseconds +* - [Kernel Cycles](KernelCycles) + - The total duration of the executed kernel in cycles. Note: this should not be directly compared to the wavefront cycles / timings below. + - Cycles +* - Instructions per wavefront + - The average number of instructions (of all types) executed per wavefront. This is averaged over all wavefronts in a kernel dispatch. + - Instructions / wavefront +* - Wave Cycles + - The number of cycles a wavefront in the kernel dispatch spent resident on a compute unit per [normalization-unit](normunit). This is averaged over all wavefronts in a kernel dispatch. Note: this should not be directly compared to the kernel cycles above. + - Cycles per [normalization-unit](normunit) +* - Dependency Wait Cycles + - The number of cycles a wavefront in the kernel dispatch stalled waiting on memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.) per [normalization-unit](normunit). This counter is incremented at every cycle by _all_ wavefronts on a CU stalled at a memory operation. As such, it is most useful to get a sense of how waves were spending their time, rather than identification of a precise limiter because another wave could be actively executing while a wave is stalled. The sum of this metric, Issue Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric. + - Cycles per [normalization-unit](normunit) +* - Issue Wait Cycles + - The number of cycles a wavefront in the kernel dispatch was unable to issue an instruction for any reason (e.g., execution pipe back-pressure, arbitration loss, etc.) per [normalization-unit](normunit). This counter is incremented at every cycle by _all_ wavefronts on a CU unable to issue an instruction. As such, it is most useful to get a sense of how waves were spending their time, rather than identification of a precise limiter because another wave could be actively executing while a wave is issue stalled. The sum of this metric, Dependency Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric. + - Cycles per [normalization-unit](normunit) +* - Active Cycles + - The average number of cycles a wavefront in the kernel dispatch was actively executing instructions per [normalization-unit](normunit). This measurement is made on a per-wavefront basis, and may include (e.g.,) cycles that another wavefront spent actively executing (e.g., on another execution unit) or was stalled. As such, it is most useful to get a sense of how waves were spending their time, rather than identification of a precise limiter. The sum of this metric, Issue Wait Cycles and Dependency Wait Cycles should be equal to the total Wave Cycles metric. + - Cycles per [normalization-unit](normunit) +* - Wavefront Occupancy + - The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (<< 1ms).
+ - Wavefronts +``` + +```{seealso} +As mentioned above, the measurement of kernel cycles and time typically cannot directly be compared to e.g., Wave Cycles. +This is due to two factors: first, the kernel cycles/timings are measured using a counter that is impacted by scheduling overhead, this is particularly noticeable for "short-running" kernels (typically << 1ms) where scheduling overhead forms a significant portion of the overall kernel runtime. +Secondly, the Wave Cycles metric is incremented per-wavefront scheduled to a SIMD every cycle whereas the kernel cycles counter is incremented only once per-cycle when _any_ wavefront is scheduled. +``` + +(Inst_mix)= +#### Instruction Mix + +The instruction mix panel shows a breakdown of the various types of instructions executed by the user's kernel, and which pipelines on the [CU](CU) they were executed on. +In addition, Omniperf reports further information about the breakdown of operation types for the [VALU](valu), vector-memory, and [MFMA](mfma) instructions. + +```{note} +All metrics in this section count _instructions issued_, and _not_ the total number of operations executed. +The values reported by these metrics will not change regardless of the execution mask of the wavefront. +We note that even if the execution mask is identically zero (i.e., _no lanes are active_) the instruction will still be counted, as CDNA accelerators still consider these instructions 'issued' see, e.g., [EXECute Mask, Section 3.3 of the CDNA2 ISA Guide](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf) for more details. +``` + +##### Overall Instruction Mix + +This panel shows the total number of each type of instruction issued to the [various compute pipelines](ERD) on the [CU](CU). +These are: + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - [VALU](valu) Instructions + - The total number of vector arithmetic logic unit (VALU) operations issued. These are the workhorses of the compute-unit, and are used to execute wide range of instruction types including floating point operations, non-uniform address calculations, transcendental operations, integer operations, shifts, conditional evaluation, etc. + - Instructions +* - VMEM Instructions + - The total number of vector memory operations issued. These include most loads, stores and atomic operations and all accesses to [generic, global, private and texture](Mspace) memory. + - Instructions +* - [LDS](lds) Instructions + - The total number of LDS (a.k.a., "shared memory") operations issued. These include (e.g.,) loads, stores, atomics, and HIP's `__shfl` operations. + - Instructions +* - [MFMA](mfma) Instructions + - The total number of matrix fused multiply-add instructions issued. + - Instructions +* - [SALU](salu) Instructions + - The total number of scalar arithmetic logic unit (SALU) operations issued. Typically these are used for (e.g.,) address calculations, literal constants, and other operations that are _provably_ uniform across a wavefront. Although scalar memory (SMEM) operations are issued by the SALU, they are counted separately in this section. + - Instructions +* - SMEM Instructions + - The total number of scalar memory (SMEM) operations issued. These are typically used for loading kernel arguments, base-pointers and loads from HIP's `__constant__` memory. + - Instructions +* - [Branch](branch) Instructions + - The total number of branch operations issued. 
These typically consist of jump / branch operations and are used to implement control flow. + - Instructions +``` + +```{note} +Note, as mentioned in the [Branch](branch) section: branch operations are not used for execution mask updates, but only for "whole wavefront" control-flow changes. +``` + +(VALU_Inst_Mix)= +##### VALU Arithmetic Instruction Mix +```{warning} +Not all metrics in this section (e.g., the floating-point instruction breakdowns) are available on CDNA accelerators older than the [MI2XX](2xxnote) series. +``` + +This panel details the various types of vector instructions that were issued to the [VALU](valu). +The metrics in this section do _not_ include [MFMA](mfma) instructions using the same precision, e.g. the "F16-ADD" metric does not include any 16-bit floating point additions executed as part of an MFMA instruction using the same precision. + +```{list-table} +:header-rows: 1 +:widths: 15 65 20 +:class: noscroll-table +* - Metric + - Description + - Unit +* - INT32 + - The total number of instructions operating on 32-bit integer operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - INT64 + - The total number of instructions operating on 64-bit integer operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F16-ADD + - The total number of addition instructions operating on 16-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F16-MUL + - The total number of multiplication instructions operating on 16-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F16-FMA + - The total number of fused multiply-add instructions operating on 16-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F16-TRANS + - The total number of transcendental instructions (e.g., `sqrt`) operating on 16-bit floating-point operands issued to the VALU per [normalization-unit](normunit) + - Instructions per [normalization-unit](normunit) +* - F32-ADD + - The total number of addition instructions operating on 32-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F32-MUL + - The total number of multiplication instructions operating on 32-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F32-FMA + - The total number of fused multiply-add instructions operating on 32-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F32-TRANS + - The total number of transcendental instructions (e.g., `sqrt`) operating on 32-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F64-ADD + - The total number of addition instructions operating on 64-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F64-MUL + - The total number of multiplication instructions operating on 64-bit floating-point operands issued to the VALU per [normalization-unit](normunit). 
+ - Instructions per [normalization-unit](normunit) +* - F64-FMA + - The total number of fused multiply-add instructions operating on 64-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F64-TRANS + - The total number of transcendental instructions (e.g., `sqrt`) operating on 64-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Conversion + - The total number of type conversion instructions (e.g., converting data to/from F32↔F64) issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +``` + +For an example of these counters in action, the reader is referred to the [VALU Arithmetic Instruction Mix example](VALU_inst_mix_example). + +##### VMEM Instruction Mix + +This section breaks down the types of vector memory (VMEM) instructions that were issued. +Refer to the [Instruction Counts metrics section](TA_inst) of the address-processor frontend of the vL1D cache for a description of these VMEM instructions. + +(MFMA_Inst_mix)= +##### MFMA Instruction Mix + +```{warning} +The metrics in this section are only available on CDNA2 ([MI2XX](2xxnote)) accelerators and newer. +``` + +This section details the types of Matrix Fused Multiply-Add ([MFMA](mfma)) instructions that were issued. +Note that [MFMA](mfma) instructions are classified by the type of input data they operate on, and _not_ the data-type the result is accumulated to. + +```{list-table} +:header-rows: 1 +:widths: 25 60 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - MFMA-I8 Instructions + - The total number of 8-bit integer [MFMA](mfma) instructions issued per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - MFMA-F16 Instructions + - The total number of 16-bit floating point [MFMA](mfma) instructions issued per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - MFMA-BF16 Instructions + - The total number of 16-bit brain floating point [MFMA](mfma) instructions issued per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - MFMA-F32 Instructions + - The total number of 32-bit floating-point [MFMA](mfma) instructions issued per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - MFMA-F64 Instructions + - The total number of 64-bit floating-point [MFMA](mfma) instructions issued per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +``` + +#### Compute Pipeline + +(FLOP_count)= +##### FLOP counting conventions + +Omniperf's conventions for VALU FLOP counting are as follows: + - Addition or Multiplication: 1 operation + - Transcendentals: 1 operation + - Fused Multiply-Add (FMA): 2 operations + +Integer operations (IOPs) do not use this convention. They are counted as a single operation regardless of the instruction type. + +```{note} +Packed operations, which operate on multiple operands in the same instruction, are counted identically to the underlying instruction type. +For example, the `v_pk_add_f32` instruction on [MI2XX](2xxnote), which performs an add operation on two pairs of aligned 32-bit floating-point operands, is counted only as a single addition (i.e., 1 operation).
+``` + +As discussed in the [Instruction Mix](Inst_mix) section, the FLOP/IOP metrics in this section do not take into account the execution mask of the operation, and will report the same value even if the execution mask is identically zero. + +For example, an FMA instruction operating on 32-bit floating-point operands (e.g., `v_fma_f32` on a [MI2XX](2xxnote) accelerator) would be counted as 128 total FLOPs: 2 operations per work-item (due to the instruction type) multiplied by 64 (because the wavefront is composed of 64 work-items). + +(Compute_SOL)= +##### Compute Speed-of-Light + +```{warning} +The theoretical maximum throughput for some metrics in this section is currently computed with the maximum achievable clock frequency, as reported by `rocminfo`, for an accelerator. This may not be realistic for all workloads. +``` + +This section reports the number of floating-point and integer operations executed on the [VALU](valu) and [MFMA](mfma) units in various precisions. +We note that unlike the [VALU instruction mix](VALU_Inst_Mix) and [MFMA instruction mix](MFMA_Inst_mix) sections, the metrics here are reported as FLOPs and IOPs, i.e., the total number of operations executed. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - VALU FLOPs + - The total floating-point operations executed per second on the [VALU](valu). This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from [MFMA](mfma) instructions. + - GFLOPs +* - VALU IOPs + - The total integer operations executed per second on the [VALU](valu). This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from [MFMA](mfma) instructions. + - GIOPs +* - MFMA FLOPs (BF16) + - The total number of 16-bit brain floating point [MFMA](mfma) operations executed per second. Note: this does not include any 16-bit brain floating point operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the specific accelerator. + - GFLOPs +* - MFMA FLOPs (F16) + - The total number of 16-bit floating point [MFMA](mfma) operations executed per second. Note: this does not include any 16-bit floating point operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific accelerator. + - GFLOPs +* - MFMA FLOPs (F32) + - The total number of 32-bit floating point [MFMA](mfma) operations executed per second. Note: this does not include any 32-bit floating point operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific accelerator. + - GFLOPs +* - MFMA FLOPs (F64) + - The total number of 64-bit floating point [MFMA](mfma) operations executed per second. Note: this does not include any 64-bit floating point operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific accelerator. + - GFLOPs +* - MFMA IOPs (INT8) + - The total number of 8-bit integer [MFMA](mfma) operations executed per second. Note: this does not include any 8-bit integer operations from [VALU](valu) instructions.
This is also presented as a percent of the peak theoretical INT8 MFMA operations achievable on the specific accelerator. + - GIOPs +``` + + +(Pipeline_stats)= +##### Pipeline Statistics + +This section reports a number of key performance characteristics of various execution units on the [CU](cu). +The reader is referred to the [Instructions per-cycle and Utilizations](IPC_example) example for a detailed dive into these metrics, and to the [scheduler](scheduler) section for a high-level overview of execution units and instruction issue. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - IPC + - The ratio of the total number of instructions executed on the [CU](cu) over the [total active CU cycles](TotalActiveCUCycles). + - Instructions per-cycle +* - IPC (Issued) + - The ratio of the total number of (non-[internal](Internal_ipc)) instructions issued over the number of cycles where the [scheduler](scheduler) was actively working on issuing instructions. The reader is referred to the [Issued IPC](Issued_ipc) example for further detail. + - Instructions per-cycle +* - SALU Utilization + - Indicates what percent of the kernel's duration the [SALU](salu) was busy executing instructions. Computed as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [SALU](salu) / [SMEM](salu) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - VALU Utilization + - Indicates what percent of the kernel's duration the [VALU](valu) was busy executing instructions. Does not include [VMEM](valu) operations. Computed as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [VALU](valu) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - VMEM Utilization + - Indicates what percent of the kernel's duration the [VMEM](valu) unit was busy executing instructions, including both global/generic and spill/scratch operations (see the [VMEM instruction count metrics](TA_inst) for more detail). Does not include [VALU](valu) operations. Computed as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [VMEM](valu) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - Branch Utilization + - Indicates what percent of the kernel's duration the [Branch](branch) unit was busy executing instructions. Computed as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [Branch](branch) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - VALU Active Threads + - Indicates the average level of [divergence](Divergence) within a wavefront over the lifetime of the kernel. The number of work-items that were active in a wavefront during execution of each [VALU](valu) instruction, time-averaged over all VALU instructions run on all wavefronts in the kernel. + - Work-items +* - MFMA Utilization + - Indicates what percent of the kernel's duration the [MFMA](mfma) unit was busy executing instructions. Computed as the ratio of the total number of cycles the [MFMA](mfma) unit was busy over the [total CU cycles](TotalCUCycles). + - Percent +* - MFMA Instruction Cycles + - The average duration of [MFMA](mfma) instructions in this kernel in cycles. Computed as the ratio of the total number of cycles the [MFMA](mfma) unit was busy over the total number of [MFMA](mfma) instructions.
Compare to e.g., the [AMD Matrix Instruction Calculator](https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator). + - Cycles per instruction +* - VMEM Latency + - The average number of round-trip cycles (i.e., from issue to data-return / acknowledgment) required for a VMEM instruction to complete. + - Cycles +* - SMEM Latency + - The average number of round-trip cycles (i.e., from issue to data-return / acknowledgment) required for a SMEM instruction to complete. + - Cycles +``` + +```{note} +The Branch utilization reported in this section also includes time spent in other instruction types (namely: `s_endpgm`) that are _typically_ a very small percentage of the overall kernel execution. This complication is omitted for simplicity, but may result in small amounts of "branch" utilization (<<1\%) for otherwise branch-less kernels. +``` + +(FLOPS)= +##### Arithmetic Operations + +This section reports the total number of floating-point and integer operations executed in various precisions. +Unlike the [Compute speed-of-light](Compute_SOL) panel, this section reports both [VALU](valu) and [MFMA](mfma) operations of the same precision (e.g., F32) in the same metric. +Additionally, this panel lets the user control how the data is normalized (i.e., control the [normalization-unit](normunit)), while the speed-of-light panel does not. +For more detail on how operations are counted see the [FLOP counting convention](FLOP_count) section. + +```{warning} +As discussed in the [Instruction Mix](Inst_Mix) section, the metrics in this section do not take into account the execution mask of the operation, and will report the same value even if EXEC is identically zero. +``` + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - FLOPs (Total) + - The total number of floating-point operations executed on either the [VALU](valu) or [MFMA](mfma) units, per [normalization-unit](normunit) + - FLOP per [normalization-unit](normunit) +* - IOPs (Total) + - The total number of integer operations executed on either the [VALU](valu) or [MFMA](mfma) units, per [normalization-unit](normunit) + - IOP per [normalization-unit](normunit) +* - F16 OPs + - The total number of 16-bit floating-point operations executed on either the [VALU](valu) or [MFMA](mfma) units, per [normalization-unit](normunit) + - FLOP per [normalization-unit](normunit) +* - BF16 OPs + - The total number of 16-bit brain floating-point operations executed on either the [VALU](valu) or [MFMA](mfma) units, per [normalization-unit](normunit). Note: on current CDNA accelerators, the [VALU](valu) has no native BF16 instructions. + - FLOP per [normalization-unit](normunit) +* - F32 OPs + - The total number of 32-bit floating-point operations executed on either the [VALU](valu) or [MFMA](mfma) units, per [normalization-unit](normunit) + - FLOP per [normalization-unit](normunit) +* - F64 OPs + - The total number of 64-bit floating-point operations executed on either the [VALU](valu) or [MFMA](mfma) units, per [normalization-unit](normunit) + - FLOP per [normalization-unit](normunit) +* - INT8 OPs + - The total number of 8-bit integer operations executed on either the [VALU](valu) or [MFMA](mfma) units, per [normalization-unit](normunit). Note: on current CDNA accelerators, the [VALU](valu) has no native INT8 instructions. 
+ - IOPs per [normalization-unit](normunit) +``` + +(LDS_metrics)= +### Local Data Share (LDS) + +#### LDS Speed-of-Light + +```{warning} +The theoretical maximum throughput for some metrics in this section are currently computed with the maximum achievable clock frequency, as reported by `rocminfo`, for an accelerator. This may not be realistic for all workloads. +``` + +The LDS speed-of-light chart shows a number of key metrics for the [LDS](lds) as a comparison with the peak achievable values of those metrics. +The reader is referred to our previous [LDS](lds) description for a more in-depth view of the hardware. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Utilization + - Indicates what percent of the kernel's duration the [LDS](lds) was actively executing instructions (including, but not limited to, load, store, atomic and HIP's `__shfl` operations). Calculated as the ratio of the total number of cycles LDS was active over the [total CU cycles](TotalCUCycles). + - Percent +* - Access Rate + - Indicates the percentage of SIMDs in the [VALU](valu){sup}`1` actively issuing LDS instructions, averaged over the lifetime of the kernel. Calculated as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [LDS](lds) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - Theoretical Bandwidth (% of Peak) + - Indicates the maximum amount of bytes that _could_ have been loaded from/stored to/atomically updated in the LDS in this kernel, as a percent of the peak LDS bandwidth achievable. See the [LDS Bandwidth example](lds_bandwidth) for more detail. + - Percent +* - Bank Conflict Rate + - Indicates the percentage of active LDS cycles that were spent servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing bank conflicts over the number of LDS cycles that would have been required to move the same amount of data in an uncontended access.{sup}`2` + - Percent +``` + +```{note} +{sup}`1` Here we assume the typical case where the workload evenly distributes LDS operations over all SIMDs in a CU (that is, waves on different SIMDs are executing similar code). +For highly unbalanced workloads, where e.g., one SIMD pair in the CU does not issue LDS instructions at all, this metric is better interpreted as the percentage of SIMDs issuing LDS instructions on [SIMD pairs](lds) that are actively using the LDS, averaged over the lifetime of the kernel. + +{sup}`2` The maximum value of the bank conflict rate is less than 100% (specifically: 96.875%), as the first cycle in the [LDS scheduler](lds) is never considered contended. +``` + +#### Statistics + +The [LDS](lds) statistics panel gives a more detailed view of the hardware: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - LDS Instructions + - The total number of LDS instructions (including, but not limited to, read/write/atomics, and e.g., HIP's `__shfl` instructions) executed per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Theoretical Bandwidth + - Indicates the maximum amount of bytes that could have been loaded from/stored to/atomically updated in the LDS per [normalization-unit](normunit). Does _not_ take into account the execution mask of the wavefront when the instruction was executed (see [LDS Bandwidth](lds_bandwidth) example for more detail). 
+ - Bytes per [normalization-unit](normunit) +* - LDS Latency + - The average number of round-trip cycles (i.e., from issue to data-return / acknowledgment) required for an LDS instruction to complete. + - Cycles +* - Bank Conflicts/Access + - The ratio of the number of cycles spent in the [LDS scheduler](lds) due to bank conflicts (as determined by the conflict resolution hardware) to the base number of cycles that would be spent in the LDS scheduler in a completely uncontended case. This is the unnormalized form of the Bank Conflict Rate. + - Conflicts/Access +* - Index Accesses + - The total number of cycles spent in the [LDS scheduler](lds) over all operations per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +* - Atomic Return Cycles + - The total number of cycles spent on LDS atomics with return per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +* - Bank Conflicts + - The total number of cycles spent in the [LDS scheduler](lds) due to bank conflicts (as determined by the conflict resolution hardware) per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +* - Address Conflicts + - The total number of cycles spent in the [LDS scheduler](lds) due to address conflicts (as determined by the conflict resolution hardware) per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +* - Unaligned Stall + - The total number of cycles spent in the [LDS scheduler](lds) due to stalls from non-dword aligned addresses per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +* - Memory Violations + - The total number of out-of-bounds accesses made to the LDS, per [normalization-unit](normunit). This is unused and expected to be zero in most configurations for modern CDNA accelerators. + - Accesses per [normalization-unit](normunit) +``` + + +(vL1D)= +### Vector L1 Cache (vL1D) + +The vector L1 data (vL1D) cache is local to each [compute unit](CU) on the accelerator, and handles vector memory operations issued by a wavefront. +The vL1D cache consists of several components: + + - an address processing unit, also known as the [texture addresser (TA)](TA), which receives commands (e.g., instructions) and write/atomic data from the [Compute Unit](CU), and coalesces them into fewer requests for the cache to process. + - an address translation unit, also known as the L1 Unified Translation Cache (UTCL1), that translates requests from virtual to physical addresses for lookup in the cache. The translation unit has an L1 translation lookaside buffer (L1TLB) to reduce the cost of repeated translations. + - a Tag RAM that looks up whether a requested cache line is already present in the [cache](TC). + - the result of the Tag RAM lookup is placed in the L1 cache controller for routing to the correct location, e.g., the [L2 Memory Interface](TCP_TCC_Transactions_Detail) for misses or the [Cache RAM](TC) for hits. + - the Cache RAM, also known as the [texture cache (TC)](TC), stores requested data for potential reuse. Data returned from the [L2 cache](L2) is placed into the Cache RAM before going down the [data-return path](TD). + - a backend data processing unit, also known as the [texture data (TD)](TD) that routes data back to the requesting [Compute Unit](CU). + +Together, this complex is known as the vL1D, or Texture Cache per Pipe (TCP). 
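+To make this pipeline concrete, the following HIP sketch annotates a trivial copy kernel with the vL1D stages that each vector memory operation passes through. This is an illustrative sketch only; the kernel and variable names are ours and do not correspond to anything reported by Omniperf.
+
+```c++
+#include <hip/hip_runtime.h>
+
+// One vector load and one vector store, annotated with the vL1D stages
+// described above.
+__global__ void copy_one_element(const float* in, float* out) {
+  // The load below is issued by the wavefront as a vector memory instruction:
+  //  1. the address processor (TA) coalesces the per-lane addresses into
+  //     cache-line requests,
+  //  2. the UTCL1 translates the virtual addresses to physical addresses,
+  //  3. the Tag RAM lookup decides hit vs. miss (misses go out to the L2,
+  //     hits are served from the Cache RAM / TC), and
+  //  4. the data-return path (TD) routes the returned data back to this
+  //     wavefront's SIMD.
+  float v = in[threadIdx.x];
+  // The store takes the same front-end path (TA -> UTCL1 -> Tag RAM), with the
+  // write data carried alongside the addresses.
+  out[threadIdx.x] = v;
+}
+
+int main() {
+  float *in = nullptr, *out = nullptr;
+  (void)hipMalloc(&in, 64 * sizeof(float));
+  (void)hipMalloc(&out, 64 * sizeof(float));
+  hipLaunchKernelGGL(copy_one_element, dim3(1), dim3(64), 0, 0, in, out);
+  (void)hipDeviceSynchronize();
+  (void)hipFree(in);
+  (void)hipFree(out);
+  return 0;
+}
+```
+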
+A simplified diagram of the vL1D is presented below: + +```{figure} images/l1perf_model.* +:scale: 150 % +:alt: Performance model of the vL1D Cache on AMD Instinct(tm) MI accelerators. +:align: center + +Performance model of the vL1D Cache on AMD Instinct(tm) MI accelerators. +``` + +(L1_SOL)= +#### vL1D Speed-of-Light + +```{warning} +The theoretical maximum throughput for some metrics in this section are currently computed with the maximum achievable clock frequency, as reported by `rocminfo`, for an accelerator. This may not be realistic for all workloads. +``` + +The vL1D's speed-of-light chart shows several key metrics for the vL1D as a comparison with the peak achievable values of those metrics. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Hit Rate + - The ratio of the number of vL1D cache line requests that hit{sup}`1` in vL1D cache over the total number of cache line requests to the [vL1D Cache RAM](TC). + - Percent +* - Bandwidth + - The number of bytes looked up in the vL1D cache as a result of [VMEM](VALU) instructions, as a percent of the peak theoretical bandwidth achievable on the specific accelerator. The number of bytes is calculated as the number of cache lines requested multiplied by the cache line size. This value does not consider partial requests, so e.g., if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. + - Percent +* - Utilization + - Indicates how busy the [vL1D Cache RAM](TC) was during the kernel execution. The number of cycles where the [vL1D Cache RAM](TC) is actively processing any request divided by the number of cycles where the [vL1D is active](vL1d_activity){sup}`2` + - Percent +* - Coalescing + - Indicates how well memory instructions were coalesced by the [address processing unit](TA), ranging from uncoalesced (25\%) to fully coalesced (100\%). The average number of [thread-requests](ThreadRequests) generated per instruction divided by the ideal number of [thread-requests](ThreadRequests) per instruction. + - Percent +``` + +(vL1d_activity)= +```{note} +{sup}`1` The vL1D cache on AMD Instinct(tm) MI CDNA accelerators uses a "hit-on-miss" approach to reporting cache hits. +That is, if while satisfying a miss, another request comes in that would hit on the same pending cache line, the subsequent request will be counted as a 'hit'. +Therefore, it is also important to consider the Access Latency metric in the [Cache access metrics](TCP_cache_access_metrics) section when evaluating the vL1D hit rate. + +{sup}`2` Omniperf considers the vL1D to be active when any part of the vL1D (excluding the [address-processor](TA) and [data-return](TD) units) are active, e.g., performing a translation, waiting for data, accessing the Tag or Cache RAMs, etc. +``` +(TA)= +#### Address Processing Unit or Texture Addresser (TA) + +The [vL1D](vL1D)'s address processing unit receives vector memory instructions (commands) along with write/atomic data from a [Compute Unit](CU) and is responsible for coalescing these into requests for lookup in the [vL1D RAM](TC). +The address processor passes information about the commands (coalescing state, destination SIMD, etc.) to the [data processing unit](TD) for use after the requested data has been retrieved. 
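+As a concrete illustration of what the address processor does, the following HIP sketch contrasts a fully coalesced access pattern with a strided one. This is an illustrative example only (the kernel names, sizes and stride are ours); on a CDNA accelerator one would generally expect the strided version to generate more cache-line requests per instruction, which is reflected in the Coalescing metric of the [speed-of-light](L1_SOL) table and in the instruction and stall metrics below.
+
+```c++
+#include <hip/hip_runtime.h>
+
+// Adjacent work-items read adjacent elements: the per-lane addresses of a
+// wavefront fall into few cache lines, so the address processor can coalesce
+// them into a small number of requests.
+__global__ void coalesced_read(const float* in, float* out, size_t n) {
+  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n) out[i] = in[i];
+}
+
+// Adjacent work-items read elements far apart: the per-lane addresses of a
+// wavefront touch many distinct cache lines, so the address processor must
+// generate many more requests per instruction for the vL1D to service.
+__global__ void strided_read(const float* in, float* out, size_t n, size_t stride) {
+  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n) out[i] = in[(i * stride) % n];
+}
+
+int main() {
+  constexpr size_t n = 1 << 24;
+  float *in = nullptr, *out = nullptr;
+  (void)hipMalloc(&in, n * sizeof(float));
+  (void)hipMalloc(&out, n * sizeof(float));
+  hipLaunchKernelGGL(coalesced_read, dim3(n / 256), dim3(256), 0, 0, in, out, n);
+  hipLaunchKernelGGL(strided_read, dim3(n / 256), dim3(256), 0, 0, in, out, n, size_t(64));
+  (void)hipDeviceSynchronize();
+  (void)hipFree(in);
+  (void)hipFree(out);
+  return 0;
+}
+```
+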
+ +Omniperf reports several metrics to indicate performance bottlenecks in the address processing unit, which are broken down into a few categories: + + - Busy / stall metrics + - Instruction counts + - Spill / Stack metrics + +##### Busy / Stall metrics + +When executing vector memory instructions, the compute unit must send an address (and in the case of writes/atomics, data) to the address processing unit. When the frontend cannot accept any more addresses, it must backpressure the wave-issue logic for the VMEM pipe and prevent the issue of a vector memory instruction until a previously issued memory operation has been processed. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Busy + - Percent of the [total CU cycles](TotalCUCycles) the address processor was busy + - Percent +* - Address Stall + - Percent of the [total CU cycles](TotalCUCycles) the address processor was stalled from sending address requests further into the vL1D pipeline + - Percent +* - Data Stall + - Percent of the [total CU cycles](TotalCUCycles) the address processor was stalled from sending write/atomic data further into the vL1D pipeline + - Percent +* - Data-Processor → Address Stall + - Percent of [total CU cycles](TotalCUCycles) the address processor was stalled waiting to send command data to the [data processor](TD) + - Percent +``` + + +(TA_inst)= +##### Instruction counts + +The address processor also counts instruction types to give the user information on what sorts of memory instructions were executed by the kernel. +These are broken down into a few major categories: + +```{list-table} +:header-rows: 1 +:widths: 20 20 60 +:class: noscroll-table +* - Memory type + - Usage + - Description +* - Global + - Global memory + - Global memory can be seen by all threads from a process. This includes the local accelerator's DRAM, remote accelerator's DRAM, and the host's DRAM. +* - Generic + - Dynamic address spaces + - Generic memory, a.k.a. "flat" memory, is used when the compiler cannot statically prove that a pointer is to memory in one or the other address spaces. The pointer could dynamically point into global, local, constant, or private memory. +* - Private Memory + - Register spills / Stack memory + - Private memory, a.k.a. "scratch" memory, is only visible to a particular [work-item](workitem) in a particular [workgroup](workgroup). On AMD Instinct(tm) MI accelerators, private memory is used to implement both register spills and stack memory accesses. +``` + +The address processor counts these instruction types as follows: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table + +* - Type + - Description + - Unit +* - Global/Generic + - The total number of global & generic memory instructions executed on all [compute units](CU) on the accelerator, per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Global/Generic Read + - The total number of global & generic memory read instructions executed on all [compute units](CU) on the accelerator, per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Global/Generic Write + - The total number of global & generic memory write instructions executed on all [compute units](CU) on the accelerator, per [normalization-unit](normunit). 
+ - Instructions per [normalization-unit](normunit) +* - Global/Generic Atomic + - The total number of global & generic memory atomic (with and without return) instructions executed on all [compute units](CU) on the accelerator, per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Spill/Stack + - The total number of spill/stack memory instructions executed on all [compute units](CU) on the accelerator, per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Spill/Stack Read + - The total number of spill/stack memory read instructions executed on all [compute units](CU) on the accelerator, per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Spill/Stack Write + - The total number of spill/stack memory write instructions executed on all [compute units](CU) on the accelerator, per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Spill/Stack Atomic + - The total number of spill/stack memory atomic (with and without return) instructions executed on all [compute units](CU) on the accelerator, per [normalization-unit](normunit). Typically unused, as these memory operations are generally used to implement thread-local storage. + - Instructions per [normalization-unit](normunit) +``` + +```{note} +The above is a simplified model specifically for the HIP programming language that does not consider (e.g.,) inline assembly usage, constant memory usage or texture memory. + +These categories correspond to: + - Global/Generic: global and flat memory operations, which are used for Global and Generic memory access. + - Spill/Stack: buffer instructions which are used on the MI50, MI100, and [MI2XX](2xxnote) accelerators for register spills / stack memory. + +These concepts are described in more detail in the [memory space section](Mspace) below, while generic memory access is explored in the [generic memory benchmark](flatmembench) section. +``` + +##### Spill/Stack metrics + +Finally, the address processing unit contains a separate coalescing stage for spill/stack memory, and thus reports: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Spill/Stack Total Cycles + - The number of cycles the address processing unit spent working on spill/stack instructions, per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +* - Spill/Stack Coalesced Read Cycles + - The number of cycles the address processing unit spent working on coalesced spill/stack read instructions, per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +* - Spill/Stack Coalesced Write Cycles + - The number of cycles the address processing unit spent working on coalesced spill/stack write instructions, per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +``` + +(UTCL1)= +#### L1 Unified Translation Cache (UTCL1) + +After a vector memory instruction has been processed/coalesced by the address processing unit of the vL1D, it must be translated from a virtual to a physical address. +This process is handled by the L1 Unified Translation Cache (UTCL1). +This cache contains an L1 Translation Lookaside Buffer (TLB), which stores recently translated addresses to reduce the cost of subsequent re-translations.
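+To illustrate when the UTCL1 matters, the following HIP sketch (illustrative only; the kernel name and the 4 KiB page-size assumption are ours, and the actual page size depends on the system configuration) touches one element per page of a large allocation. Nearly every request then needs a distinct virtual-to-physical translation, so one would expect a comparatively low UTCL1 hit ratio and a high translation-miss count, whereas reading the same buffer contiguously would reuse the cached translations.
+
+```c++
+#include <hip/hip_runtime.h>
+
+// Reads one float from each page of a large allocation, so that successive
+// requests map to different pages and (usually) different UTCL1 entries.
+__global__ void page_stride_read(const float* in, float* out, size_t n_pages,
+                                 size_t floats_per_page) {
+  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n_pages) out[i] = in[i * floats_per_page];
+}
+
+int main() {
+  constexpr size_t page_bytes = 4096;            // assumed page size, for illustration only
+  constexpr size_t n_pages = size_t(1) << 16;    // 64 Ki pages -> 256 MiB buffer
+  constexpr size_t floats_per_page = page_bytes / sizeof(float);
+
+  float *in = nullptr, *out = nullptr;
+  (void)hipMalloc(&in, n_pages * page_bytes);
+  (void)hipMalloc(&out, n_pages * sizeof(float));
+  hipLaunchKernelGGL(page_stride_read, dim3(n_pages / 256), dim3(256), 0, 0,
+                     in, out, n_pages, floats_per_page);
+  (void)hipDeviceSynchronize();
+  (void)hipFree(in);
+  (void)hipFree(out);
+  return 0;
+}
+```
+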
+ +Omniperf reports the following L1 TLB metrics: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Requests + - The number of translation requests made to the UTCL1 per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Hits + - The number of translation requests that hit in the UTCL1, and could be reused, per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Hit Ratio + - The ratio of the number of translation requests that hit in the UTCL1 divided by the total number of translation requests made to the UTCL1. + - Percent +* - Translation Misses + - The total number of translation requests that missed in the UTCL1 due to translation not being present in the cache, per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Permission Misses + - The total number of translation requests that missed in the UTCL1 due to a permission error, per [normalization-unit](normunit). This is unused and expected to be zero in most configurations for modern CDNA accelerators. + - Requests per [normalization-unit](normunit) +``` +```{note} +On current CDNA accelerators, such as the [MI2XX](2xxnote), the UTCL1 does _not_ count hit-on-miss requests. +``` + +(TC)= +#### Vector L1 Cache RAM (TC) + +After coalescing in the [address processing unit](TA) of the vL1D and address translation in the [L1 TLB](UTCL1), the request proceeds to the Cache RAM stage of the pipeline. +Incoming requests are looked up in the cache RAMs using parts of the physical address as a tag. +Hits will be returned through the [data-return path](TD), while misses will be routed out to the [L2 Cache](L2) for servicing. + +The metrics tracked by the vL1D RAM include: + + - Stall metrics + - Cache access metrics + - vL1D-L2 transaction detail metrics + +(TCP_cache_stall_metrics)= +##### vL1D cache stall metrics + +The vL1D also reports where it is stalled in the pipeline, which may indicate performance limiters of the cache. +A stall in the pipeline may result in backpressuring earlier parts of the pipeline, e.g., a stall on L2 requests may backpressure the wave-issue logic of the [VMEM](VALU) pipe and prevent it from issuing more vector memory instructions until the vL1D's outstanding requests are completed. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Stalled on L2 Data + - The ratio of the number of cycles where the vL1D is stalled waiting for requested data to return from the [L2 cache](L2) divided by the number of cycles where the [vL1D is active](vL1d_activity). + - Percent +* - Stalled on L2 Requests + - The ratio of the number of cycles where the vL1D is stalled waiting to issue a request for data to the [L2 cache](L2) divided by the number of cycles where the [vL1D is active](vL1d_activity). + - Percent +* - Tag RAM Stall (Read/Write/Atomic) + - The ratio of the number of cycles where the vL1D is stalled due to Read/Write/Atomic requests with conflicting tags being looked up concurrently, divided by the number of cycles where the [vL1D is active](vL1d_activity). + - Percent +``` + +(TCP_cache_access_metrics)= +##### vL1D cache access metrics + +The vL1D cache access metrics broadly indicate the type of requests incoming from the [cache frontend](TA), the number of requests that were serviced by the vL1D, and the number & type of outgoing requests to the [L2 cache](L2).
In addition, this section includes the approximate latencies of accesses to the cache itself, along with latencies of read/write memory operations to the [L2 cache](L2). + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Total Requests + - The total number of incoming requests from the [address processing unit](TA) after coalescing. + - Requests +* - Total read/write/atomic requests + - The total number of incoming read/write/atomic requests from the [address processing unit](TA) after coalescing per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Cache Bandwidth + - The number of bytes looked up in the vL1D cache as a result of [VMEM](VALU) instructions per [normalization-unit](normunit). The number of bytes is calculated as the number of cache lines requested multiplied by the cache line size. This value does not consider partial requests, so e.g., if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. + - Bytes per [normalization-unit](normunit) +* - Cache Hit Rate + - The ratio of the number of vL1D cache line requests that hit in vL1D cache over the total number of cache line requests to the [vL1D Cache RAM](TC). + - Percent +* - Cache Accesses + - The total number of cache line lookups in the vL1D. + - Cache lines +* - Cache Hits + - The number of cache accesses minus the number of outgoing requests to the [L2 cache](L2), i.e., the number of cache line requests serviced by the [vL1D Cache RAM](TC) per [normalization-unit](normunit). + - Cache lines per [normalization-unit](normunit) +* - Invalidations + - The number of times the vL1D was issued a write-back invalidate command during the kernel's execution per [normalization-unit](normunit). This may be triggered by, e.g., the `buffer_wbinvl1` instruction. + - Invalidations per [normalization-unit](normunit) +* - L1-L2 Bandwidth + - The number of bytes transferred across the vL1D-L2 interface as a result of [VMEM](VALU) instructions, per [normalization-unit](normunit). The number of bytes is calculated as the number of cache lines requested multiplied by the cache line size. This value does not consider partial requests, so e.g., if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. + - Bytes per [normalization-unit](normunit) +* - L1-L2 Reads + - The number of read requests for a vL1D cache line that were not satisfied by the vL1D and must be retrieved from the [L2 Cache](L2) per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - L1-L2 Writes + - The number of post-coalescing write requests that are sent through the vL1D to the [L2 cache](L2), per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - L1-L2 Atomics + - The number of atomic requests that are sent through the vL1D to the [L2 cache](L2), per [normalization-unit](normunit). This includes requests for atomics with and without return. + - Requests per [normalization-unit](normunit) +* - L1 Access Latency + - The average number of cycles that a vL1D cache line request spent in the vL1D cache pipeline. + - Cycles +* - L1-L2 Read Access Latency + - The average number of cycles that the vL1D cache took to issue and receive read requests from the [L2 Cache](L2). This number also includes requests for atomics with return values.
+ - Cycles +* - L1-L2 Write Access Latency + - The average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the [L2 Cache](L2). This number also includes requests for atomics without return values. + - Cycles +``` + +```{note} +All cache accesses in vL1D are for a single cache line's worth of data. +The size of a cache line may vary, however on current AMD Instinct(tm) MI CDNA accelerators and GCN GPUs the L1 cache line size is 64B. +``` + +(TCP_TCC_Transactions_Detail)= +##### vL1D - L2 Transaction Detail + +This section provides a more granular look at the types of requests made to the [L2 cache](L2). +These are broken down by the operation type (read / write / atomic, with, or without return), and the [memory type](Mtype). +For more detail, the reader is referred to the [Memory Types](Mtype) section. + + +(TD)= +#### Vector L1 Data-Return Path or Texture Data (TD) + +The data-return path of the vL1D cache, also known as the Texture Data (TD) unit, is responsible for routing data returned from the [vL1D cache RAM](TC) back to a wavefront on a SIMD. +As described in the [vL1D cache front-end](TA) section, the data-return path is passed information about the space requirements and routing for data requests from the [VALU](valu). +When data is returned from the [vL1D cache RAM](TC), it is matched to this previously stored request data, and returned to the appropriate SIMD. + +Omniperf reports the following vL1D data-return path metrics: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Data-return Busy + - Percent of the [total CU cycles](TotalCUCycles) the data-return unit was busy processing or waiting on data to return to the [CU](CU). + - Percent +* - Cache RAM → Data-return Stall + - Percent of the [total CU cycles](TotalCUCycles) the data-return unit was stalled on data to be returned from the [vL1D Cache RAM](TC). + - Percent +* - Workgroup manager → Data-return Stall + - Percent of the [total CU cycles](TotalCUCycles) the data-return unit was stalled by the [workgroup manager](SPI) due to initialization of registers as a part of launching new workgroups. + - Percent +* - Coalescable Instructions + - The number of instructions submitted to the [data-return unit](TD) by the [address-processor](TA) that were found to be coalescable, per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Read Instructions + - The number of read instructions submitted to the [data-return unit](TD) by the [address-processor](TA) summed over all [compute units](CU) on the accelerator, per [normalization-unit](normunit). This is expected to be the sum of global/generic and spill/stack reads in the [address processor](TA_inst). + - Instructions per [normalization-unit](normunit) +* - Write Instructions + - The number of store instructions submitted to the [data-return unit](TD) by the [address-processor](TA) summed over all [compute units](CU) on the accelerator, per [normalization-unit](normunit). This is expected to be the sum of global/generic and spill/stack stores counted by the [vL1D cache-frontend](TA_inst). + - Instructions per [normalization-unit](normunit) +* - Atomic Instructions + - The number of atomic instructions submitted to the [data-return unit](TD) by the [address-processor](TA) summed over all [compute units](CU) on the accelerator, per [normalization-unit](normunit). 
This is expected to be the sum of global/generic and spill/stack atomics in the [address processor](TA_inst). + - Instructions per [normalization-unit](normunit) +``` + +(L2)= +## L2 Cache (TCC) + +The L2 cache is the coherence point for current AMD Instinct(tm) MI GCN GPUs and CDNA accelerators, and is shared by all [compute units](CU) on the device. +Besides serving requests from the [vector L1 data caches](vL1D), the L2 cache also is responsible for servicing requests from the [L1 instruction caches](L1I), the [scalar L1 data caches](sL1D) and the [command-processor](CP). +The L2 cache is composed of a number of distinct channels (32 on MI100/[MI2XX](2xxnote) series CDNA accelerators at 256B address interleaving) which can largely operate independently. +Mapping of incoming requests to a specific L2 channel is determined by a hashing mechanism that attempts to evenly distribute requests across the L2 channels. +Requests that miss in the L2 cache are passed out to [Infinity Fabric(tm)](l2fabric) to be routed to the appropriate memory location. + +The L2 cache metrics reported by Omniperf are broken down into four categories: + + - L2 Speed-of-Light + - L2 Cache Accesses + - L2-Fabric Transactions + - L2-Fabric Stalls + + +(L2SoL)= +### L2 Speed-of-Light + +```{warning} +The theoretical maximum throughput for some metrics in this section are currently computed with the maximum achievable clock frequency, as reported by `rocminfo`, for an accelerator. This may not be realistic for all workloads. +``` + +The L2 cache's speed-of-light table contains a few key metrics about the performance of the L2 cache, aggregated over all the L2 channels, as a comparison with the peak achievable values of those metrics: + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Utilization + - The ratio of the [number of cycles an L2 channel was active, summed over all L2 channels on the accelerator](TotalActiveL2Cycles) over the [total L2 cycles](TotalL2Cycles). + - Percent +* - Bandwidth + - The number of bytes looked up in the L2 cache, as a percent of the peak theoretical bandwidth achievable on the specific accelerator. The number of bytes is calculated as the number of cache lines requested multiplied by the cache line size. This value does not consider partial requests, so e.g., if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. + - Percent +* - Hit Rate + - The ratio of the number of L2 cache line requests that hit in the L2 cache over the total number of incoming cache line requests to the L2 cache. + - Percent +* - L2-Fabric Read BW + - The number of bytes read by the L2 over the [Infinity Fabric(tm) interface](l2fabric) per unit time. + - GB/s +* - L2-Fabric Write and Atomic BW + - The number of bytes sent by the L2 over the [Infinity Fabric(tm) interface](l2fabric) by write and atomic operations per unit time. + - GB/s +``` + +```{note} +The L2 cache on AMD Instinct(tm) MI CDNA accelerators uses a "hit-on-miss" approach to reporting cache hits. +That is, if while satisfying a miss, another request comes in that would hit on the same pending cache line, the subsequent request will be counted as a 'hit'. +Therefore, it is also important to consider the latency metric in the [L2-Fabric](l2fabric) section when evaluating the L2 hit rate. 
+``` + +(L2_cache_metrics)= +### L2 Cache Accesses + +This section details the incoming requests to the L2 cache from the [vL1D](vL1D) and other clients (e.g., the [sL1D](sL1D) and [L1I](L1I) caches). + +```{list-table} +:header-rows: 1 +:widths: 13 70 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Bandwidth + - The number of bytes looked up in the L2 cache, per [normalization-unit](normunit). The number of bytes is calculated as the number of cache lines requested multiplied by the cache line size. This value does not consider partial requests, so e.g., if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. + - Bytes per [normalization-unit](normunit) +* - Requests + - The total number of incoming requests to the L2 from all clients for all request types, per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Read Requests + - The total number of read requests to the L2 from all clients. + - Requests per [normalization-unit](normunit) +* - Write Requests + - The total number of write requests to the L2 from all clients. + - Requests per [normalization-unit](normunit) +* - Atomic Requests + - The total number of atomic requests (with and without return) to the L2 from all clients. + - Requests per [normalization-unit](normunit) +* - Streaming Requests + - The total number of incoming requests to the L2 that are marked as 'streaming'. The exact meaning of this may differ depending on the targeted accelerator, however on an [MI2XX](2xxnote) this corresponds to [non-temporal load or stores](https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins). The L2 cache attempts to evict 'streaming' requests before normal requests when the L2 is at capacity. + - Requests per [normalization-unit](normunit) +* - Probe Requests + - The number of coherence probe requests made to the L2 cache from outside the accelerator. On an [MI2XX](2xxnote), probe requests may be generated by e.g., writes to [fine-grained device](MType) memory or by writes to [coarse-grained](MType) device memory. + - Requests per [normalization-unit](normunit) +* - Hit Rate + - The ratio of the number of L2 cache line requests that hit in the L2 cache over the total number of incoming cache line requests to the L2 cache. + - Percent +* - Hits + - The total number of requests to the L2 from all clients that hit in the cache. As noted in the [speed-of-light](L2SoL) section, this includes hit-on-miss requests. + - Requests per [normalization-unit](normunit) +* - Misses + - The total number of requests to the L2 from all clients that miss in the cache. As noted in the [speed-of-light](L2SoL) section, these do not include hit-on-miss requests. + - Requests per [normalization-unit](normunit) +* - Writebacks + - The total number of L2 cache lines written back to memory for any reason. Write-backs may occur due to e.g., user-code (e.g., HIP kernel calls to `__threadfence_system`, or atomic built-ins), by the [command-processor](CP)'s memory acquire/release fences, or for other internal hardware reasons. + - Cache lines per [normalization-unit](normunit) +* - Writebacks (Internal) + - The total number of L2 cache lines written back to memory for internal hardware reasons, per [normalization-unit](normunit). 
+ - Cache lines per [normalization-unit](normunit) +* - Writebacks (vL1D Req) + - The total number of L2 cache lines written back to memory due to requests initiated by the [vL1D cache](vL1D), per [normalization-unit](normunit). + - Cache lines per [normalization-unit](normunit) +* - Evictions (Normal) + - The total number of L2 cache lines evicted from the cache due to capacity limits, per [normalization-unit](normunit). + - Cache lines per [normalization-unit](normunit) +* - Evictions (vL1D Req) + - The total number of L2 cache lines evicted from the cache due to invalidation requests initiated by the [vL1D cache](vL1D), per [normalization-unit](normunit). + - Cache lines per [normalization-unit](normunit) +* - Non-hardware-Coherent Requests + - The total number of requests to the L2 made to Not-hardware-Coherent (NC) memory allocations, per [normalization-unit](normunit). See the [Memory Types section](Mtype) for more detail. + - Requests per [normalization-unit](normunit) +* - Uncached Requests + - The total number of requests to the L2 made to uncached (UC) memory allocations. See the [Memory Types section](Mtype) for more detail. + - Requests per [normalization-unit](normunit) +* - Coherently Cached Requests + - The total number of requests to the L2 made to coherently cachable (CC) memory allocations. See the [Memory Types section](Mtype) for more detail. + - Requests per [normalization-unit](normunit) +* - Read/Write Coherent Requests + - The total number of requests to the L2 made to Read-Write coherent (RW) memory allocations. See the [Memory Types section](Mtype) for more detail. + - Requests per [normalization-unit](normunit) +``` + +```{note} +All requests to the L2 are for a single cache line's worth of data. +The size of a cache line may vary depending on the accelerator, however on an AMD Instinct(tm) CDNA2 [MI2XX](2xxnote) accelerator, it is 128B, while on an MI100, it is 64B. +``` + +(l2fabric)= +### L2-Fabric transactions + +Requests/data that miss in the L2 must be routed to memory in order to service them. +The backing memory for a request may be local to this accelerator (i.e., in the local high-bandwidth memory), in a remote accelerator's memory, or even in the CPU's memory. +Infinity Fabric(tm) is responsible for routing these memory requests/data to the correct location and returning any fetched data to the L2 cache. +The [following section](L2_req_flow) describes the flow of these requests through Infinity Fabric(tm) in more detail, as described by Omniperf metrics, while [later sections](L2_req_metrics) give detailed definitions of individual metrics. + +(L2_req_flow)= +#### Request flow + +Below is a diagram that illustrates how L2↔Fabric requests are reported by Omniperf: + + +```{figure} images/fabric.png +:alt: L2↔Fabric transaction flow on AMD Instinct(tm) MI accelerators. +:align: center +:name: fabric-fig + +L2↔Fabric transaction flow on AMD Instinct(tm) MI accelerators. +``` + +Requests from the L2 Cache are broken down into two major categories: read requests and write requests (at this granularity, atomic requests are treated as writes). + +From there, these requests can additionally be subdivided in a number of ways. +First, these requests may be sent across Infinity Fabric(tm) as different transaction sizes, 32B or 64B on current CDNA accelerators. + +```{note} +On current CDNA accelerators, the 32B read request path is expected to be unused (hence: is disconnected in the flow diagram).
+``` + +In addition, the read and write requests can be further categorized as: + - uncached read/write requests, e.g., for accesses to [fine-grained memory](Mtype) + - atomic requests, e.g., for atomic updates to [fine-grained memory](Mtype) + - HBM read/write requests OR remote read/write requests, i.e., for requests to the accelerator's local HBM OR requests to a remote accelerator's HBM / the CPU's DRAM. + +These classifications are not necessarily _exclusive_; for example, a write request can be classified as both an atomic request to the accelerator's local HBM and an uncached write request. +The request-flow diagram marks _exclusive_ classifications as a splitting of the flow, while _non-exclusive_ requests do not split the flow line. +For example, a request is either a 32B Write Request OR a 64B Write Request, as the flow splits at this point: +```{figure} images/split.* +:scale: 50 % +:alt: Request flow splitting +:align: center +:name: split-request-flow-fig + +Splitting request flow +``` +However, continuing along, the same request might be an Atomic request and an Uncached Write request, as reflected by a non-split flow: +```{figure} images/nosplit.* +:scale: 50 % +:alt: Non-splitting request flow +:align: center +:name: nosplit-request-flow-fig + +Non-splitting request flow +``` + +Finally, we note that [uncached](Mtype) read requests (e.g., to [fine-grained memory](Mtype)) are handled specially on CDNA accelerators, as indicated in the request flow diagram. +These are expected to be counted as a 64B Read Request, and _if_ they are requests to uncached memory (denoted by the dashed line), they will also be counted as _two_ uncached read requests (i.e., the request is split): + +```{figure} images/uncached.* +:scale: 50 % +:alt: Uncached read-request splitting +:align: center +:name: uncached-read-request-flow-fig + +Uncached read-request splitting. +``` + +(L2_req_metrics)= +#### Metrics + + +The following metrics are reported for the L2-Fabric interface: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - L2-Fabric Read Bandwidth + - The total number of bytes read by the L2 cache from Infinity Fabric(tm) per [normalization-unit](normunit). + - Bytes per [normalization-unit](normunit) +* - HBM Read Traffic + - The percent of read requests generated by the L2 cache that are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown does not consider the _size_ of the request (i.e., 32B and 64B requests are both counted as a single request), so this metric only _approximates_ the percent of the L2-Fabric Read bandwidth directed to the local HBM. + - Percent +* - Remote Read Traffic + - The percent of read requests generated by the L2 cache that are routed to any memory location other than the accelerator's local high-bandwidth memory (HBM) --- e.g., the CPU's DRAM, a remote accelerator's HBM, etc. This breakdown does not consider the _size_ of the request (i.e., 32B and 64B requests are both counted as a single request), so this metric only _approximates_ the percent of the L2-Fabric Read bandwidth directed to a remote location. + - Percent +* - Uncached Read Traffic + - The percent of read requests generated by the L2 cache that are reading from an [uncached memory allocation](Mtype).
Note, as described in the [request-flow](L2_req_flow) section, a single 64B read request is typically counted as two uncached read requests, hence it is possible for the Uncached Read Traffic to reach up to 200% of the total number of read requests. This breakdown does not consider the _size_ of the request (i.e., 32B and 64B requests are both counted as a single request), so this metric only _approximates_ the percent of the L2-Fabric read bandwidth directed to an uncached memory location. + - Percent +* - L2-Fabric Write and Atomic Bandwidth + - The total number of bytes written by the L2 over Infinity Fabric(tm) by write and atomic operations per [normalization-unit](normunit). Note that on current CDNA accelerators, such as the [MI2XX](2xxnote), requests are only considered 'atomic' by Infinity Fabric(tm) if they are targeted at non-write-cachable memory, e.g., [fine-grained memory](Mtype) allocations or [uncached memory](Mtype) allocations on the [MI2XX](2xxnote). + - Bytes per [normalization-unit](normunit) +* - HBM Write and Atomic Traffic + - The percent of write and atomic requests generated by the L2 cache that are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown does not consider the _size_ of the request (i.e., 32B and 64B requests are both counted as a single request), so this metric only _approximates_ the percent of the L2-Fabric Write and Atomic bandwidth directed to the local HBM. Note that on current CDNA accelerators, such as the [MI2XX](2xxnote), requests are only considered 'atomic' by Infinity Fabric(tm) if they are targeted at [fine-grained memory](Mtype) allocations or [uncached memory](Mtype) allocations. + - Percent +* - Remote Write and Atomic Traffic + - The percent of write and atomic requests generated by the L2 cache that are routed to any memory location other than the accelerator's local high-bandwidth memory (HBM) --- e.g., the CPU's DRAM, a remote accelerator's HBM, etc. This breakdown does not consider the _size_ of the request (i.e., 32B and 64B requests are both counted as a single request), so this metric only _approximates_ the percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location. Note that on current CDNA accelerators, such as the [MI2XX](2xxnote), requests are only considered 'atomic' by Infinity Fabric(tm) if they are targeted at non-write-cachable memory, e.g., [fine-grained memory](Mtype) allocations or [uncached memory](Mtype) allocations on the [MI2XX](2xxnote). + - Percent +* - Atomic Traffic + - The percent of write requests generated by the L2 cache that are atomic requests to _any_ memory location. This breakdown does not consider the _size_ of the request (i.e., 32B and 64B requests are both counted as a single request), so this metric only _approximates_ the percent of the L2-Fabric Write and Atomic bandwidth that is due to use of atomics. Note that on current CDNA accelerators, such as the [MI2XX](2xxnote), requests are only considered 'atomic' by Infinity Fabric(tm) if they are targeted at [fine-grained memory](Mtype) allocations or [uncached memory](Mtype) allocations. + - Percent +* - Uncached Write and Atomic Traffic + - The percent of write and atomic requests generated by the L2 cache that are targeting [uncached memory allocations](Mtype). 
This breakdown does not consider the _size_ of the request (i.e., 32B and 64B requests are both counted as a single request), so this metric only _approximates_ the percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations. + - Percent +* - Read Latency + - The time-averaged number of cycles read requests spent in Infinity Fabric(tm) before data was returned to the L2. + - Cycles +* - Write Latency + - The time-averaged number of cycles write requests spent in Infinity Fabric(tm) before a completion acknowledgement was returned to the L2. + - Cycles +* - Atomic Latency + - The time-averaged number of cycles atomic requests spent in Infinity Fabric(tm) before a completion acknowledgement (atomic without return value) or data (atomic with return value) was returned to the L2. + - Cycles +* - Read Stall + - The ratio of the total number of cycles the L2-Fabric interface was stalled on a read request to any destination (local HBM, remote PCIe(r) connected accelerator / CPU, or remote Infinity Fabric(tm) connected accelerator{sup}`1` / CPU) over the [total active L2 cycles](TotalActiveL2Cycles). + - Percent +* - Write Stall + - The ratio of the total number of cycles the L2-Fabric interface was stalled on a write or atomic request to any destination (local HBM, remote PCIe(r) connected accelerator / CPU, or remote Infinity Fabric(tm) connected accelerator{sup}`1` / CPU) over the [total active L2 cycles](TotalActiveL2Cycles). + - Percent +``` + +(L2_req_metric_details)= +#### Detailed Transaction Metrics + +The following metrics are available in the detailed L2-Fabric transaction breakdown table: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - 32B Read Requests + - The total number of L2 requests to Infinity Fabric(tm) to read 32B of data from any memory location, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. Typically unused on CDNA accelerators. + - Requests per [normalization-unit](normunit) +* - Uncached Read Requests + - The total number of L2 requests to Infinity Fabric(tm) to read [uncached data](Mtype) from any memory location, per [normalization-unit](normunit). 64B requests for uncached data are counted as two 32B uncached data requests. See [request-flow](L2_req_flow) for more detail. + - Requests per [normalization-unit](normunit) +* - 64B Read Requests + - The total number of L2 requests to Infinity Fabric(tm) to read 64B of data from any memory location, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. + - Requests per [normalization-unit](normunit) +* - HBM Read Requests + - The total number of L2 requests to Infinity Fabric(tm) to read 32B or 64B of data from the accelerator's local HBM, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. + - Requests per [normalization-unit](normunit) +* - Remote Read Requests + - The total number of L2 requests to Infinity Fabric(tm) to read 32B or 64B of data from any source other than the accelerator's local HBM, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. + - Requests per [normalization-unit](normunit) +* - 32B Write and Atomic Requests + - The total number of L2 requests to Infinity Fabric(tm) to write or atomically update 32B of data to any memory location, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail.
+ - Requests per [normalization-unit](normunit) +* - Uncached Write and Atomic Requests + - The total number of L2 requests to Infinity Fabric(tm) to write or atomically update 32B or 64B of [uncached data](Mtype), per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. + - Requests per [normalization-unit](normunit) +* - 64B Write and Atomic Requests + - The total number of L2 requests to Infinity Fabric(tm) to write or atomically update 64B of data in any memory location, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. + - Requests per [normalization-unit](normunit) +* - HBM Write and Atomic Requests + - The total number of L2 requests to Infinity Fabric(tm) to write or atomically update 32B or 64B of data in the accelerator's local HBM, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. + - Requests per [normalization-unit](normunit) +* - Remote Write and Atomic Requests + - The total number of L2 requests to Infinity Fabric(tm) to write or atomically update 32B or 64B of data in any memory location other than the accelerator's local HBM, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. + - Requests per [normalization-unit](normunit) +* - Atomic Requests + - The total number of L2 requests to Infinity Fabric(tm) to atomically update 32B or 64B of data in any memory location, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. Note that on current CDNA accelerators, such as the [MI2XX](2xxnote), requests are only considered 'atomic' by Infinity Fabric(tm) if they are targeted at non-write-cachable memory, e.g., [fine-grained memory](Mtype) allocations or [uncached memory](Mtype) allocations on the [MI2XX](2xxnote). + - Requests per [normalization-unit](normunit) +``` + +### L2-Fabric Interface Stalls + +When the interface between the L2 cache and Infinity Fabric(tm) becomes backed up by requests, it may stall preventing the L2 from issuing additional requests to Infinity Fabric(tm) until prior requests complete. +This section gives a breakdown of what types of requests in a kernel caused a stall (e.g., read vs write), and to which locations (e.g., to the accelerator's local memory, or to remote accelerators/CPUs). + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Read - PCIe(r) Stall + - The number of cycles the L2-Fabric interface was stalled on read requests to remote PCIe(r) connected accelerators{sup}`1` or CPUs as a percent of the [total active L2 cycles](TotalActiveL2Cycles). + - Percent +* - Read - Infinity Fabric(tm) Stall + - The number of cycles the L2-Fabric interface was stalled on read requests to remote Infinity Fabric(tm) connected accelerators{sup}`1` or CPUs as a percent of the [total active L2 cycles](TotalActiveL2Cycles). + - Percent +* - Read - HBM Stall + - The number of cycles the L2-Fabric interface was stalled on read requests to the accelerator's local HBM as a percent of the [total active L2 cycles](TotalActiveL2Cycles). + - Percent +* - Write - PCIe(r) Stall + - The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote PCIe(r) connected accelerators{sup}`1` or CPUs as a percent of the [total active L2 cycles](TotalActiveL2Cycles). 
+ - Percent +* - Write - Infinity Fabric(tm) Stall + - The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote Infinity Fabric(tm) connected accelerators{sup}`1` or CPUs as a percent of the [total active L2 cycles](TotalActiveL2Cycles). + - Percent +* - Write - HBM Stall + - The number of cycles the L2-Fabric interface was stalled on write or atomic requests to the accelerator's local HBM as a percent of the [total active L2 cycles](TotalActiveL2Cycles). + - Percent +* - Write - Credit Starvation + - The number of cycles the L2-Fabric interface was stalled on write or atomic requests to any memory location because too many write/atomic requests were currently in flight, as a percent of the [total active L2 cycles](TotalActiveL2Cycles). + - Percent +``` + +```{note} +{sup}`1` In addition to being used for on-accelerator data-traffic, AMD [Infinity Fabric](https://www.amd.com/en/technologies/infinity-architecture)(tm) technology can be used to connect multiple accelerators to achieve advanced peer-to-peer connectivity and enhanced bandwidths over traditional PCIe(r) connections. +Some AMD Instinct(tm) MI accelerators, e.g., the MI250X, [feature coherent CPU↔accelerator connections built using AMD Infinity Fabric(tm)](https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf). +``` + +```{warning} +On current CDNA accelerators and GCN GPUs, these L2↔Fabric stalls can be undercounted in some circumstances. +``` + +(SE)= +## Shader Engine (SE) + +The [CUs](CU) on a CDNA accelerator are grouped together into a higher-level organizational unit called a Shader Engine (SE): + +```{figure} images/selayout.png +:alt: Example of CU-grouping into shader-engines on AMD Instinct(tm) MI accelerators. +:align: center +:name: selayout-fig + +Example of CU-grouping into shader-engines on AMD Instinct(tm) MI accelerators. +``` + +The number of CUs on an SE varies from chip to chip (see, for example [AMD GPU HIP Training](https://www.olcf.ornl.gov/wp-content/uploads/2019/09/AMD_GPU_HIP_training_20190906.pdf), slide 20). +In addition, newer accelerators such as the AMD Instinct(tm) MI250X have 8 SEs per accelerator. + +For the purposes of Omniperf, we consider resources that are shared between multiple CUs on a single SE as part of the SE's metrics. +These include: + - the [scalar L1 data cache](sL1D) + - the [L1 instruction cache](L1I) + - the [workgroup manager](SPI) + +(sL1D)= +### Scalar L1 Data Cache (sL1D) + +The Scalar L1 Data cache (sL1D) can cache data accessed from scalar load instructions (and scalar store instructions on architectures where they exist) from wavefronts in the [CUs](CU). +The sL1D is shared between multiple CUs ([GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah), slide 36) --- the exact number of CUs depends on the architecture in question (3 CUs in GCN GPUs and MI100, 2 CUs in [MI2XX](2xxnote)) --- and is backed by the [L2](L2) cache. + +In typical usage, the data in the sL1D includes, e.g.: + - Kernel arguments, e.g., pointers, [non-populated](https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-sgpr-register-set-up-order-table) grid/block dimensions, etc. + - HIP's `__constant__` memory, when accessed in a provably uniform{sup}`1` manner + - Other memory, when accessed in a provably uniform manner, *and* the backing memory is provably constant{sup}`1` + +```{note} +{sup}`1` +The scalar data cache is used when the compiler emits scalar loads to access data. +This requires that the data be _provably_ uniformly accessed (i.e., the compiler can verify that all work-items in a wavefront access the same data), _and_ that the data can be proven to be read-only (e.g., HIP's `__constant__` memory, or properly `__restrict__`'ed pointers to avoid write-aliasing). +Accesses to, e.g., `__constant__` memory are not guaranteed to go through the sL1D if, e.g., the wavefront loads a non-uniform value. +```
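+
+As a concrete illustration of the kind of access the compiler can service from the sL1D, consider the following sketch. This is a hypothetical kernel (not one of the Omniperf samples); whether a particular load is actually lowered to a scalar load depends on the compiler version and optimization level:
+
+```c++
+#include <hip/hip_runtime.h>
+
+// `coeffs[0]` is the same address for every work-item in the wavefront
+// (provably uniform) and `coeffs` is `__restrict__`'ed and only read
+// (provably read-only), so the compiler may emit a scalar load (s_load_*)
+// for it, which would be serviced by the sL1D.
+__global__ void scale(float* __restrict__ out, const float* __restrict__ in,
+                      const float* __restrict__ coeffs, size_t n) {
+  const float c = coeffs[0];  // candidate for a scalar load via the sL1D
+  const size_t idx = threadIdx.x + blockIdx.x * static_cast<size_t>(blockDim.x);
+  if (idx < n) {
+    // `in[idx]` varies per work-item, so it is a vector load through the vL1D.
+    out[idx] = c * in[idx];
+  }
+}
+```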
+ +(sL1D_SOL)= +#### Scalar L1D Speed-of-Light + +```{warning} +The theoretical maximum throughput for some metrics in this section is currently computed with the maximum achievable clock frequency, as reported by `rocminfo`, for an accelerator. This may not be realistic for all workloads. +``` + +The Scalar L1D speed-of-light chart shows some key metrics of the sL1D cache as a comparison with the peak achievable values of those metrics: + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Bandwidth + - The number of bytes looked up in the sL1D cache, as a percent of the peak theoretical bandwidth. Calculated as the ratio of sL1D requests over the [total sL1D cycles](TotalSL1DCycles). + - Percent +* - Cache Hit Rate + - The percent of sL1D requests that hit{sup}`1` on a previously loaded line in the cache. Calculated as the ratio of the number of sL1D requests that hit over the number of all sL1D requests. + - Percent +* - sL1D-L2 BW + - The number of bytes requested by the sL1D from the L2 cache, as a percent of the peak theoretical sL1D → L2 cache bandwidth. Calculated as the ratio of the total number of requests from the sL1D to the L2 cache over the [total sL1D-L2 interface cycles](TotalSL1DCycles). + - Percent +``` + +```{note} +{sup}`1` Unlike the [vL1D](vL1D) and [L2](L2) caches, the sL1D cache on AMD Instinct(tm) MI CDNA accelerators does _not_ use a "hit-on-miss" approach to reporting cache hits. +That is, if while satisfying a miss, another request comes in that would hit on the same pending cache line, the subsequent request will be counted as a 'duplicated miss' (see below). +``` + +#### Scalar L1D Cache Accesses + +This panel gives more detail on the types of accesses made to the sL1D, and the hit/miss statistics. + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Requests + - The total number of requests, of any size or type, made to the sL1D per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Hits + - The total number of sL1D requests that hit on a previously loaded cache line, per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Misses - Non Duplicated + - The total number of sL1D requests that missed on a cache line that *was not* already pending due to another request, per [normalization-unit](normunit). See note in [speed-of-light section](sL1D_SOL) for more detail. + - Requests per [normalization-unit](normunit) +* - Misses - Duplicated + - The total number of sL1D requests that missed on a cache line that *was* already pending due to another request, per [normalization-unit](normunit). See note in [speed-of-light section](sL1D_SOL) for more detail. + - Requests per [normalization-unit](normunit) +* - Cache Hit Rate + - Indicates the percent of sL1D requests that hit on a previously loaded line in the cache. The ratio of the number of sL1D requests that hit{sup}`1` over the number of all sL1D requests.
+ - Percent +* - Read Requests (Total) + - The total number of sL1D read requests of any size, per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Atomic Requests + - The total number of sL1D atomic requests of any size, per [normalization-unit](normunit). Typically unused on CDNA accelerators. + - Requests per [normalization-unit](normunit) +* - Read Requests (1 DWord) + - The total number of sL1D read requests made for a single dword of data (4B), per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Read Requests (2 DWord) + - The total number of sL1D read requests made for two dwords of data (8B), per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Read Requests (4 DWord) + - The total number of sL1D read requests made for four dwords of data (16B), per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Read Requests (8 DWord) + - The total number of sL1D read requests made for eight dwords of data (32B), per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Read Requests (16 DWord) + - The total number of sL1D read requests made for sixteen dwords of data (64B), per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +``` + +```{note} +{sup}`1` Unlike the [vL1D](vL1D) and [L2](L2) caches, the sL1D cache on AMD Instinct(tm) MI CDNA accelerators does _not_ use a "hit-on-miss" approach to reporting cache hits. +That is, if while satisfying a miss, another request comes in that would hit on the same pending cache line, the subsequent request will be counted as a 'duplicated miss' (see below). +``` + +#### sL1D ↔ L2 Interface + +This panel gives more detail on the data requested across the sL1D↔[L2](L2) interface. + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - sL1D-L2 BW + - The total number of bytes read from/written to/atomically updated across the sL1D↔[L2](L2) interface, per [normalization-unit](normunit). Note that sL1D writes and atomics are typically unused on current CDNA accelerators, so in the majority of cases this can be interpreted as an sL1D→L2 read bandwidth. + - Bytes per [normalization-unit](normunit) +* - Read Requests + - The total number of read requests from sL1D to the [L2](L2), per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Write Requests + - The total number of write requests from sL1D to the [L2](L2), per [normalization-unit](normunit). Typically unused on current CDNA accelerators. + - Requests per [normalization-unit](normunit) +* - Atomic Requests + - The total number of atomic requests from sL1D to the [L2](L2), per [normalization-unit](normunit). Typically unused on current CDNA accelerators. + - Requests per [normalization-unit](normunit) +* - Stall Cycles + - The total number of cycles the sL1D↔[L2](L2) interface was stalled, per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +``` + +(L1I)= +### L1 Instruction Cache (L1I) + +As with the [sL1D](sL1D), the L1 Instruction (L1I) cache is shared between multiple CUs on a shader-engine, where the precise number of CUs sharing an L1I depends on the architecture in question ([GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah), slide 36) and is backed by the [L2](L2) cache.
+Unlike the sL1D, the instruction cache is read-only. + +(L1I_SOL)= +#### L1I Speed-of-Light + +```{warning} +The theoretical maximum throughput for some metrics in this section is currently computed with the maximum achievable clock frequency, as reported by `rocminfo`, for an accelerator. This may not be realistic for all workloads. +``` + +The L1 Instruction Cache speed-of-light chart shows some key metrics of the L1I cache as a comparison with the peak achievable values of those metrics: + +```{list-table} +:header-rows: 1 +:widths: 15 70 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Bandwidth + - The number of bytes looked up in the L1I cache, as a percent of the peak theoretical bandwidth. Calculated as the ratio of L1I requests over the [total L1I cycles](TotalL1ICycles). + - Percent +* - Cache Hit Rate + - The percent of L1I requests that hit on a previously loaded line in the cache. Calculated as the ratio of the number of L1I requests that hit{sup}`1` over the number of all L1I requests. + - Percent +* - L1I-L2 BW + - The percent of the peak theoretical L1I → L2 cache request bandwidth achieved. Calculated as the ratio of the total number of requests from the L1I to the L2 cache over the [total L1I-L2 interface cycles](TotalL1ICycles). + - Percent +* - Instruction Fetch Latency + - The average number of cycles spent to fetch instructions to a [CU](cu). + - Cycles +``` + +```{note} +{sup}`1` Unlike the [vL1D](vL1D) and [L2](L2) caches, the L1I cache on AMD Instinct(tm) MI CDNA accelerators does _not_ use a "hit-on-miss" approach to reporting cache hits. +That is, if while satisfying a miss, another request comes in that would hit on the same pending cache line, the subsequent request will be counted as a 'duplicated miss' (see below). +``` + +#### L1I Cache Accesses + +This panel gives more detail on the hit/miss statistics of the L1I: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Requests + - The total number of requests made to the L1I per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Hits + - The total number of L1I requests that hit on a previously loaded cache line, per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Misses - Non Duplicated + - The total number of L1I requests that missed on a cache line that *was not* already pending due to another request, per [normalization-unit](normunit). See note in [speed-of-light section](L1I_SOL) for more detail. + - Requests per [normalization-unit](normunit) +* - Misses - Duplicated + - The total number of L1I requests that missed on a cache line that *was* already pending due to another request, per [normalization-unit](normunit). See note in [speed-of-light section](L1I_SOL) for more detail. + - Requests per [normalization-unit](normunit) +* - Cache Hit Rate + - The percent of L1I requests that hit{sup}`1` on a previously loaded line in the cache. Calculated as the ratio of the number of L1I requests that hit over the number of all L1I requests. + - Percent +``` + +```{note} +{sup}`1` Unlike the [vL1D](vL1D) and [L2](L2) caches, the L1I cache on AMD Instinct(tm) MI CDNA accelerators does _not_ use a "hit-on-miss" approach to reporting cache hits. +That is, if while satisfying a miss, another request comes in that would hit on the same pending cache line, the subsequent request will be counted as a 'duplicated miss' (see below).
+``` + +#### L1I - L2 Interface + +This panel gives more detail on the data requested across the L1I-[L2](L2) interface. + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - L1I-L2 BW + - The total number of bytes read across the L1I-[L2](L2) interface, per [normalization-unit](normunit). + - Bytes per [normalization-unit](normunit) +``` + +(SPI)= +### Workgroup manager (SPI) + +The workgroup manager (SPI) is the bridge between the [command processor](CP) and the [compute units](CU). +After the [command processor](cp) processes a kernel dispatch, it will then pass the dispatch off to the workgroup manager, which then schedules [workgroups](workgroup) onto the [compute units](CU). +As workgroups complete execution and resources become available, the workgroup manager will schedule new workgroups onto [compute units](CU). +The workgroup manager's metrics therefore are focused on reporting, e.g.: + + - Utilizations of various parts of the accelerator that the workgroup manager interacts with (and the workgroup manager itself) + - How many workgroups were dispatched, their size, and how many resources they used + - Percent of scheduler opportunities (cycles) where workgroups failed to dispatch, and + - Percent of scheduler opportunities (cycles) where workgroups failed to dispatch due to lack of a specific resource on the CUs (e.g., too many VGPRs allocated) + +This gives the user an idea of why the workgroup manager couldn't schedule more wavefronts onto the device, and is most useful for workloads that the user suspects to be scheduling/launch-rate limited. + +As discussed in the [command processor](cp) description, the command processor on AMD Instinct(tm) MI architectures contains four hardware scheduler-pipes, each with eight software threads ([“Vega10” - Mantor](https://old.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.120-Radeon-Vega10-Mantor-AMD-f1.pdf), slide 19). +Each scheduler-pipe can issue a kernel dispatch to the workgroup manager to schedule concurrently. +Therefore, some workgroup manager metrics are presented relative to the utilization of these scheduler-pipes (e.g., whether all four are issuing concurrently). + +```{note} +Current versions of the profiling libraries underlying Omniperf attempt to serialize concurrent kernels running on the accelerator, as the performance counters on the device are global (i.e., shared between concurrent kernels). +This means that these scheduler-pipe utilization metrics are expected to reach e.g., a maximum of one pipe active, i.e., only 25\%. +``` + +#### Workgroup Manager Utilizations + +This section describes the utilization of the workgroup manager, and the hardware components it interacts with. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Accelerator Utilization + - The percent of cycles in the kernel where the accelerator was actively doing any work. + - Percent +* - Scheduler-Pipe Utilization + - The percent of [total scheduler-pipe cycles](TotalPipeCycles) in the kernel where the scheduler-pipes were actively doing any work. Note: this value is expected to range between 0-25%, see note in [workgroup-manager](SPI) description. + - Percent +* - Workgroup Manager Utilization + - The percent of cycles in the kernel where the Workgroup Manager was actively doing any work. 
+ - Percent +* - Shader Engine Utilization + - The percent of [total shader-engine cycles](TotalSECycles) in the kernel where any CU in a shader-engine was actively doing any work, normalized over all shader-engines. Low values (e.g., << 100%) indicate that the accelerator was not fully saturated by the kernel, or a potential load-imbalance issue. + - Percent +* - SIMD Utilization + - The percent of [total SIMD cycles](TotalSIMDCycles) in the kernel where any [SIMD](VALU) on a CU was actively doing any work, summed over all CUs. Low values (e.g., << 100%) indicate that the accelerator was not fully saturated by the kernel, or a potential load-imbalance issue. + - Percent +* - Dispatched Workgroups + - The total number of workgroups forming this kernel launch. + - Workgroups +* - Dispatched Wavefronts + - The total number of wavefronts, summed over all workgroups, forming this kernel launch. + - Wavefronts +* - VGPR Writes + - The average number of cycles spent initializing [VGPRs](valu) at wave creation. + - Cycles/wave +* - SGPR Writes + - The average number of cycles spent initializing [SGPRs](salu) at wave creation. + - Cycles/wave +``` + +#### Workgroup Manager - Resource Allocation + +This panel gives more detail on how workgroups/wavefronts were scheduled onto compute units, and what occupancy limiters they hit (if any). +When analyzing these metrics, the user should also take into account their achieved occupancy (i.e., [Wavefront occupancy](Wavefront_runtime_stats)). +A kernel may be occupancy limited by e.g., LDS usage, but may still achieve high occupancy levels such that improving occupancy further may not improve performance. +See the [Workgroup Manager - Occupancy Limiters](Occupancy_example) example for more details. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Not-scheduled Rate (Workgroup Manager) + - The percent of [total scheduler-pipe cycles](TotalPipeCycles) in the kernel where a workgroup could not be scheduled to a [CU](CU) due to a bottleneck within the workgroup manager rather than a lack of a [CU](CU)/[SIMD](VALU) with sufficient resources. Note: this value is expected to range between 0-25%, see note in [workgroup-manager](SPI) description. + - Percent +* - Not-scheduled Rate (Scheduler-Pipe) + - The percent of [total scheduler-pipe cycles](TotalPipeCycles) in the kernel where a workgroup could not be scheduled to a [CU](CU) due to a bottleneck within the scheduler-pipes rather than a lack of a [CU](CU)/[SIMD](VALU) with sufficient resources. Note: this value is expected to range between 0-25%, see note in [workgroup-manager](SPI) description. + - Percent +* - Scheduler-Pipe Stall Rate + - The percent of [total scheduler-pipe cycles](TotalPipeCycles) in the kernel where a workgroup could not be scheduled to a [CU](CU) due to occupancy limitations (i.e., a lack of a [CU](CU)/[SIMD](VALU) with sufficient resources). Note: this value is expected to range between 0-25%, see note in [workgroup-manager](SPI) description. + - Percent +* - Scratch Stall Rate + - The percent of [total shader-engine cycles](TotalSECycles) in the kernel where a workgroup could not be scheduled to a [CU](CU) due to lack of [private (a.k.a., scratch) memory](Mtype) slots. While this can reach up to 100\%, we note that the actual occupancy limitations on a kernel using private memory are typically quite small (e.g., <1\% of the total number of waves that can be scheduled to an accelerator). 
+ - Percent +* - Insufficient SIMD Waveslots + - The percent of [total SIMD cycles](TotalSIMDCycles) in the kernel where a workgroup could not be scheduled to a [SIMD](valu) due to lack of available [waveslots](valu). + - Percent +* - Insufficient SIMD VGPRs + - The percent of [total SIMD cycles](TotalSIMDCycles) in the kernel where a workgroup could not be scheduled to a [SIMD](valu) due to lack of available [VGPRs](valu). + - Percent +* - Insufficient SIMD SGPRs + - The percent of [total SIMD cycles](TotalSIMDCycles) in the kernel where a workgroup could not be scheduled to a [SIMD](valu) due to lack of available [SGPRs](salu). + - Percent +* - Insufficient CU LDS + - The percent of [total CU cycles](TotalCUCycles) in the kernel where a workgroup could not be scheduled to a [CU](cu) due to lack of available [LDS](lds). + - Percent +* - Insufficient CU Barriers + - The percent of [total CU cycles](TotalCUCycles) in the kernel where a workgroup could not be scheduled to a [CU](cu) due to lack of available [barriers](barrier). + - Percent +* - Reached CU Workgroup Limit + - The percent of [total CU cycles](TotalCUCycles) in the kernel where a workgroup could not be scheduled to a [CU](cu) due to limits within the workgroup manager. This is expected to always be zero on CDNA2 or newer accelerators (and small for previous accelerators). + - Percent +* - Reached CU Wavefront Limit + - The percent of [total CU cycles](TotalCUCycles) in the kernel where a wavefront could not be scheduled to a [CU](cu) due to limits within the workgroup manager. This is expected to always be zero on CDNA2 or newer accelerators (and small for previous accelerators). + - Percent +``` + +(CP)= +## Command Processor (CP) + +The command processor -- a.k.a., the CP -- is responsible for interacting with the AMDGPU Kernel Driver (a.k.a., the Linux Kernel) on the CPU and for interacting with user-space HSA clients when they submit commands to HSA queues. +Basic tasks of the CP include reading commands (e.g., corresponding to a kernel launch) out of [HSA Queues](http://hsafoundation.com/wp-content/uploads/2021/02/HSA-Runtime-1.2.pdf) (Sec. 2.5), scheduling work to subsequent parts of the scheduler pipeline, and marking kernels complete for synchronization events on the host. + +The command processor is composed of two sub-components: + + - Fetcher (CPF): Fetches commands out of memory to hand them over to the CPC for processing + - Packet Processor (CPC): The micro-controller running the command processing firmware that decodes the fetched commands, and (for kernels) passes them to the [Workgroup Processors](SPI) for scheduling + +Before scheduling work to the accelerator, the command-processor can first acquire a memory fence to ensure system consistency [(Sec 2.6.4)](http://hsafoundation.com/wp-content/uploads/2021/02/HSA-Runtime-1.2.pdf). +After the work is complete, the command-processor can apply a memory-release fence. +Depending on the AMD CDNA accelerator in question, either of these operations _may_ initiate a cache write-back or invalidation. + +Analyzing command processor performance is most interesting for kernels that the user suspects to be scheduling/launch-rate limited.
+The command processor's metrics are therefore focused on reporting, e.g.: + + - Utilization of the fetcher + - Utilization of the packet processor, and of its packet decoding + - Fetch/processing stalls + +### Command Processor Fetcher (CPF) Metrics + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - CPF Utilization + - Percent of total cycles where the CPF was busy actively doing any work. The ratio of CPF busy cycles over total cycles counted by the CPF. + - Percent +* - CPF Stall + - Percent of CPF busy cycles where the CPF was stalled for any reason. + - Percent +* - CPF-L2 Utilization + - Percent of total cycles counted by the CPF-[L2](L2) interface where the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles over total cycles counted by the CPF-L2. + - Percent +* - CPF-L2 Stall + - Percent of CPF-L2 busy cycles where the CPF-[L2](L2) interface was stalled for any reason. + - Percent +* - CPF-UTCL1 Stall + - Percent of CPF busy cycles where the CPF was stalled by address translation. + - Percent +``` + +### Command Processor Packet Processor (CPC) Metrics + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - CPC Utilization + - Percent of total cycles where the CPC was busy actively doing any work. The ratio of CPC busy cycles over total cycles counted by the CPC. + - Percent +* - CPC Stall + - Percent of CPC busy cycles where the CPC was stalled for any reason. + - Percent +* - CPC Packet Decoding Utilization + - Percent of CPC busy cycles spent decoding commands for processing. + - Percent +* - CPC-Workgroup Manager Utilization + - Percent of CPC busy cycles spent dispatching workgroups to the [Workgroup Manager](SPI). + - Percent +* - CPC-L2 Utilization + - Percent of total cycles counted by the CPC-[L2](L2) interface where the CPC-L2 interface was active doing any work. + - Percent +* - CPC-UTCL1 Stall + - Percent of CPC busy cycles where the CPC was stalled by address translation. + - Percent +* - CPC-UTCL2 Utilization + - Percent of total cycles counted by the CPC's L2 address translation interface where the CPC was busy doing address translation work. + - Percent +``` + +## System Speed-of-Light + +```{warning} +The theoretical maximum throughput for some metrics in this section is currently computed with the maximum achievable clock frequency, as reported by `rocminfo`, for an accelerator. This may not be realistic for all workloads. + +In addition, not all metrics (e.g., FLOP counters) are available on all AMD Instinct(tm) MI accelerators. +For more detail on how operations are counted, see the [FLOP counting convention](FLOP_count) section. +``` + +Finally, the system speed-of-light summarizes some of the key metrics from various sections of Omniperf's profiling report. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - [VALU](valu) FLOPs + - The total floating-point operations executed per second on the [VALU](valu). This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from [MFMA](mfma) instructions. + - GFLOPs +* - [VALU](valu) IOPs + - The total integer operations executed per second on the [VALU](valu). This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator.
Note: this does not include any integer operations from [MFMA](mfma) instructions. + - GIOPs +* - [MFMA](mfma) FLOPs (BF16) + - The total number of 16-bit brain floating point [MFMA](mfma) operations executed per second. Note: this does not include any 16-bit brain floating point operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the specific accelerator. + - GFLOPs +* - [MFMA](mfma) FLOPs (F16) + - The total number of 16-bit floating point [MFMA](mfma) operations executed per second. Note: this does not include any 16-bit floating point operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific accelerator. + - GFLOPs +* - [MFMA](mfma) FLOPs (F32) + - The total number of 32-bit floating point [MFMA](mfma) operations executed per second. Note: this does not include any 32-bit floating point operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific accelerator. + - GFLOPs +* - [MFMA](mfma) FLOPs (F64) + - The total number of 64-bit floating point [MFMA](mfma) operations executed per second. Note: this does not include any 64-bit floating point operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific accelerator. + - GFLOPs +* - [MFMA](mfma) IOPs (INT8) + - The total number of 8-bit integer [MFMA](mfma) operations executed per second. Note: this does not include any 8-bit integer operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations achievable on the specific accelerator. + - GIOPs +* - [SALU](salu) Utilization + - Indicates what percent of the kernel's duration the [SALU](salu) was busy executing instructions. Computed as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [SALU](salu) / [SMEM](salu) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - [VALU](valu) Utilization + - Indicates what percent of the kernel's duration the [VALU](valu) was busy executing instructions. Does not include [VMEM](valu) operations. Computed as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [VALU](valu) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - [MFMA](mfma) Utilization + - Indicates what percent of the kernel's duration the [MFMA](mfma) unit was busy executing instructions. Computed as the ratio of the total number of cycles the [MFMA](mfma) was busy over the [total CU cycles](TotalCUCycles). + - Percent +* - [VMEM](valu) Utilization + - Indicates what percent of the kernel's duration the [VMEM](valu) unit was busy executing instructions, including both global/generic and spill/scratch operations (see the [VMEM instruction count metrics](TA_inst) for more detail). Does not include [VALU](valu) operations. Computed as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [VMEM](valu) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - [Branch](branch) Utilization + - Indicates what percent of the kernel's duration the [Branch](branch) unit was busy executing instructions. 
Computed as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [Branch](branch) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - [VALU](valu) Active Threads + - Indicates the average level of [divergence](Divergence) within a wavefront over the lifetime of the kernel. The number of work-items that were active in a wavefront during execution of each [VALU](valu) instruction, time-averaged over all VALU instructions run on all wavefronts in the kernel. + - Work-items +* - IPC + - The ratio of the total number of instructions executed on the [CU](cu) over the [total active CU cycles](TotalActiveCUCycles). This is also presented as a percent of the peak theoretical IPC achievable on the specific accelerator. + - Instructions per-cycle +* - Wavefront Occupancy + - The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (<< 1ms). This is also presented as a percent of the peak theoretical occupancy achievable on the specific accelerator. + - Wavefronts +* - [LDS](lds) Theoretical Bandwidth + - Indicates the maximum amount of bytes that could have been loaded from/stored to/atomically updated in the LDS per unit time (see [LDS Bandwidth](lds_bandwidth) example for more detail). This is also presented as a percent of the peak theoretical LDS bandwidth achievable on the specific accelerator. + - GB/s +* - [LDS](lds) Bank Conflicts/Access + - The ratio of the number of cycles spent in the [LDS scheduler](lds) due to bank conflicts (as determined by the conflict resolution hardware) to the base number of cycles that would be spent in the LDS scheduler in a completely uncontended case. This is also presented in normalized form (i.e., the Bank Conflict Rate). + - Conflicts/Access +* - [vL1D](vL1D) Cache Hit Rate + - The ratio of the number of vL1D cache line requests that hit in vL1D cache over the total number of cache line requests to the [vL1D Cache RAM](TC). + - Percent +* - [vL1D](vL1D) Cache Bandwidth + - The number of bytes looked up in the vL1D cache as a result of [VMEM](VALU) instructions per unit time. The number of bytes is calculated as the number of cache lines requested multiplied by the cache line size. This value does not consider partial requests, so e.g., if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + - GB/s +* - [L2](L2) Cache Hit Rate + - The ratio of the number of L2 cache line requests that hit in the L2 cache over the total number of incoming cache line requests to the L2 cache. + - Percent +* - [L2](L2) Cache Bandwidth + - The number of bytes looked up in the L2 cache per unit time. The number of bytes is calculated as the number of cache lines requested multiplied by the cache line size. This value does not consider partial requests, so e.g., if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + - GB/s +* - [L2](L2)-Fabric Read BW + - The number of bytes read by the L2 over the [Infinity Fabric(tm) interface](l2fabric) per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+ - GB/s +* - [L2](L2)-Fabric Write and Atomic BW + - The number of bytes sent by the L2 over the [Infinity Fabric(tm) interface](l2fabric) by write and atomic operations per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + - GB/s +* - [L2](L2)-Fabric Read Latency + - The time-averaged number of cycles read requests spent in Infinity Fabric(tm) before data was returned to the L2. + - Cycles +* - [L2](L2)-Fabric Write Latency + - The time-averaged number of cycles write requests spent in Infinity Fabric(tm) before a completion acknowledgement was returned to the L2. + - Cycles +* - [sL1D](sL1D) Cache Hit Rate + - The percent of sL1D requests that hit on a previously loaded line in the cache. Calculated as the ratio of the number of sL1D requests that hit over the number of all sL1D requests. + - Percent +* - [sL1D](sL1D) Bandwidth + - The number of bytes looked up in the sL1D cache per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + - GB/s +* - [L1I](L1I) Bandwidth + - The number of bytes looked up in the L1I cache per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + - GB/s +* - [L1I](L1I) Cache Hit Rate + - The percent of L1I requests that hit on a previously loaded line in the cache. Calculated as the ratio of the number of L1I requests that hit over the number of all L1I requests. + - Percent +* - [L1I](L1I) Fetch Latency + - The average number of cycles spent to fetch instructions to a [CU](cu). + - Cycles +``` + +## References + +- [AMD GPU HIP Training](https://www.olcf.ornl.gov/wp-content/uploads/2019/09/AMD_GPU_HIP_training_20190906.pdf) +- [CDNA2 ISA Documentation](https://developer.amd.com/wp-content/resources/CDNA2_Shader_ISA_4February2022.pdf) +- [HSA Runtime Programmer’s Reference Manual](http://hsafoundation.com/wp-content/uploads/2021/02/HSA-Runtime-1.2.pdf) +- [GS-4106 The AMD GCN Architecture - A Crash Course, by Layla Mah](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah) +- [AMD RADEON™ HD 7970 WITH GRAPHICS CORE NEXT (GCN) ARCHITECTURE, by Mike Mantor](https://old.hotchips.org/wp-content/uploads/hc_archives/hc24/HC24-3-ManyCore/HC24.28.315-AMD.GCN.mantor_v1.pdf) +- [AMD’s Radeon Next Generation GPU Architecture “Vega10”, by Mike Mantor](https://old.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.120-Radeon-Vega10-Mantor-AMD-f1.pdf) +- [CDNA2 Whitepaper](https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf) +- [LLVM's User Guide for AMDGPU Backend](https://llvm.org/docs/AMDGPUUsage.html) + +## Disclaimer + +PCIe(r) is a registered trademark of PCI-SIG Corporation.
+ + +# Definitions + +## Miscellaneous + +(TotalActiveCUCycles)= +(TotalCUCycles)= +(TotalSL1DCycles)= +(TotalL1ICycles)= +(TotalL2Cycles)= +(TotalActiveL2Cycles)= +(TotalPipeCycles)= +(TotalSECycles)= +(TotalSIMDCycles)= +(ThreadRequests)= +(Wavefront)= +(Workitem)= +(Workgroup)= +(Divergence)= +(KernelCycles)= +(KernelTime)= + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Name + - Description + - Unit +* - Kernel Time + - The number of seconds the accelerator was executing a kernel, from the [Command Processor](CP)'s start-of-kernel timestamp (which is a number of cycles after the CP begins processing the packet) to the CP's end-of-kernel timestamp (which is a number of cycles before the CP stops processing the packet. + - Seconds +* - Kernel Cycles + - The number of cycles the accelerator was active doing _any_ work, as measured by the [Command Processor](CP). + - Cycles +* - Total CU Cycles + - The number of cycles the accelerator was active doing _any_ work (i.e., Kernel Cycles), multiplied by the number of [compute units](CU) on the accelerator. A measure of the total possible active cycles the compute units could be doing work, useful for normalization of metrics inside the CU. + - Cycles +* - Total Active CU Cycles + - The number of cycles a CU on the accelerator was active doing _any_ work, summed over all [compute units](CU) on the accelerator. + - Cycles +* - Total SIMD Cycles + - The number of cycles the accelerator was active doing _any_ work (i.e., Kernel Cycles), multiplied by the number of [SIMDs](CU) on the accelerator. A measure of the total possible active cycles the SIMDs could be doing work, useful for normalization of metrics inside the CU. + - Cycles +* - Total L2 Cycles + - The number of cycles the accelerator was active doing _any_ work (i.e., Kernel Cycles), multiplied by the number of [L2](L2) channels on the accelerator. A measure of the total possible active cycles the L2 channels could be doing work, useful for normalization of metrics inside the L2. + - Cycles +* - Total Active L2 Cycles + - The number of cycles a channel of the L2 cache was active doing _any_ work, summed over all [L2](L2) channels on the accelerator. + - Cycles +* - Total sL1D Cycles + - The number of cycles the accelerator was active doing _any_ work (i.e., Kernel Cycles), multiplied by the number of [scalar L1 Data caches](sL1D) on the accelerator. A measure of the total possible active cycles the sL1Ds could be doing work, useful for normalization of metrics inside the sL1D. + - Cycles +* - Total L1I Cycles + - The number of cycles the accelerator was active doing _any_ work (i.e., Kernel Cycles), multiplied by the number of [L1 Instruction caches](L1I) on the accelerator. A measure of the total possible active cycles the L1Is could be doing work, useful for normalization of metrics inside the L1I. + - Cycles +* - Total Scheduler-Pipe Cycles + - The number of cycles the accelerator was active doing _any_ work (i.e., Kernel Cycles), multiplied by the number of [scheduler pipes](CP) on the accelerator. A measure of the total possible active cycles the scheduler-pipes could be doing work, useful for normalization of metrics inside the [workgroup manager](SPI) and [command processor](CP). + - Cycles +* - Total Shader-Engine Cycles + - The total number of cycles the accelerator was active doing _any_ work, multiplied by the number of [Shader Engines](SE) on the accelerator. 
A measure of the total possible active cycles the Shader Engines could be doing work, useful for normalization of metrics inside the [workgroup manager](SPI). + - Cycles +* - Thread-requests + - The number of unique memory addresses accessed by a single memory instruction. On AMD's Instinct(tm) accelerators, this is a maximum of 64 (i.e., the size of the wavefront). + - Addresses +* - Work-item + - A single 'thread' (lane) of execution that executes in lockstep with the rest of the work-items comprising a [wavefront](Wavefront) of execution. + - N/A +* - Wavefront + - A group of work-items, or threads, that execute in lockstep on the [compute-unit](CU). On AMD's Instinct(tm) accelerators, the wavefront size is always 64 work-items. + - N/A +* - Workgroup + - A group of wavefronts that execute on the same [compute-unit](CU), and can cooperatively execute and share data via the use of synchronization primitives, [LDS](lds), atomics, etc. + - N/A +* - Divergence + - Divergence within a wavefront occurs when not all work-items are active when executing an instruction, e.g., due to non-uniform control flow within a wavefront. Can reduce overall execution efficiency by causing, e.g., the [VALU](valu) to have to execute both branches of a conditional with different sets of work-items active. + - N/A +``` + +(normunit)= +## Normalization units + +A user-configurable unit by which the user can choose to normalize data. Choices include: + +```{list-table} +:header-rows: 1 +:widths: 20 80 +:class: noscroll-table +* - Name + - Description +* - `per_cycle` + - The total value of the measured counter/metric that occurred per kernel invocation divided by the [Kernel Cycles](KernelCycles), i.e., total number of cycles the kernel executed as measured by the [Command Processor](CP). +* - `per_wave` + - The total value of the measured counter/metric that occurred per kernel invocation divided by the total number of [wavefronts](wavefront) launched in the kernel. +* - `per_kernel` + - The total value of the measured counter/metric that occurred per kernel invocation. +* - `per_second` + - The total value of the measured counter/metric that occurred per kernel invocation divided by the [Kernel Time](KernelTime), i.e., the total runtime of the kernel in seconds, as measured by the [Command Processor](CP). +``` + +By default, Omniperf uses the `per_wave` normalization. The appropriate normalization will vary depending on your use case. +For instance, a `per_second` normalization may be useful for FLOP or bandwidth comparisons, while a `per_wave` normalization may be useful, e.g., to see how many (and what types of) instructions are used per wavefront, and a `per_kernel` normalization may be useful to get the total aggregate values of metrics for comparison between different configurations.
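+
+When analyzing a profile from the command line, the normalization unit can be selected with the `-n` option (shown here with a placeholder workload path; the same option appears in the profiling examples later in this document), e.g.:
+
+```shell-session
+$ omniperf analyze -p workloads/<workload_name>/mi200 -n per_kernel
+```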
+ +(Mspace)= +## Memory Spaces + +AMD Instinct(tm) MI accelerators can access memory through multiple address spaces which may map to different physical memory locations on the system. +The [table below](mspace-table) provides a view of how various types of memory used in HIP map onto these constructs: + +```{list-table} Memory / Address space terminology +:header-rows: 1 +:name: mspace-table +:class: noscroll-table + +* - LLVM Address Space + - Hardware Memory Space + - HIP Terminology +* - Generic + - Flat + - N/A +* - Global + - Global + - Global +* - Local + - LDS + - LDS/Shared +* - Private + - Scratch + - Private +* - Constant + - Same as global + - Constant +``` + +Below is a high-level description of the address spaces in the AMDGPU backend of LLVM: + +```{list-table} +:header-rows: 1 +:widths: 20 80 +:class: noscroll-table + +* - Address space + - Description +* - Global + - Memory that can be seen by all threads in a process, and may be backed by the local accelerator's HBM, a remote accelerator's HBM, or the CPU's DRAM. +* - Local + - Memory that is only visible to a particular workgroup. On AMD's Instinct(tm) accelerator hardware, this is stored in [LDS](LDS) memory. +* - Private + - Memory that is only visible to a particular [work-item](workitem) (thread), stored in the scratch space on AMD's Instinct(tm) accelerators. +* - Constant + - Read-only memory that is in the global address space and stored on the local accelerator's HBM. +* - Generic + - Used when the compiler cannot statically prove that a pointer is addressing memory in a single (non-generic) address space. Mapped to Flat on AMD's Instinct(tm) accelerators, the pointer could dynamically address global, local, private or constant memory. +``` + +[LLVM's documentation for AMDGPU Backend](https://llvm.org/docs/AMDGPUUsage.html#address-spaces) will always have the most up-to-date information, and the interested reader is referred to this source for a more complete explanation. + +(Mtype)= +## Memory Type + +AMD Instinct(tm) accelerators contain a number of different memory allocation types to enable the HIP language's [memory coherency model](https://rocm.docs.amd.com/projects/HIP/en/latest/user_guide/programming_manual.html#coherency-controls). +These memory types are broadly similar between AMD Instinct(tm) accelerator generations, but may differ in exact implementation. + +In addition, these memory types _may_ differ between accelerators on the same system, even when accessing the same memory allocation. +For example, an [MI2XX](2xxnote) accelerator accessing "fine-grained" memory allocated local to that device may see the allocation as coherently cachable, while a remote accelerator might see the same allocation as uncached. + +These memory types include: + +```{list-table} +:header-rows: 1 +:widths: 20 80 +:class: noscroll-table + * - Memory type + - Description + * - Uncached Memory (UC) + - Memory that will not be cached in this accelerator. On [MI2XX](2xxnote) accelerators, this corresponds to "fine-grained" (a.k.a., "coherent") memory allocated on a remote accelerator or the host, e.g., using `hipHostMalloc` or `hipMallocManaged` with default allocation flags. + * - Non-hardware-Coherent Memory (NC) + - Memory that will be cached by the accelerator, and is only guaranteed to be consistent at kernel boundaries / after software-driven synchronization events. On [MI2XX](2xxnote) accelerators, this type of memory maps to (e.g.,) "coarse-grained" `hipHostMalloc`'d memory (i.e., allocated with the `hipHostMallocNonCoherent` flag), or `hipMalloc`'d memory allocated on a remote accelerator. + * - Coherently Cachable (CC) + - Memory for which only reads from the accelerator where the memory was allocated will be cached. Writes to CC memory are uncached, and trigger invalidations of any line within this accelerator. On [MI2XX](2xxnote) accelerators, this type of memory maps to "fine-grained" memory allocated on the local accelerator using, e.g., the `hipExtMallocWithFlags` API with the `hipDeviceMallocFinegrained` flag. + * - Read/Write Coherent Memory (RW) + - Memory that will be cached by the accelerator, but may be invalidated by writes from remote devices at kernel boundaries / after software-driven synchronization events. On [MI2XX](2xxnote) accelerators, this corresponds to "coarse-grained" memory allocated locally to the accelerator, using e.g., the default `hipMalloc` allocator. +```
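+
+To make the classifications above concrete, the sketch below shows how each type might be requested from HIP on an [MI2XX](2xxnote) accelerator, using the allocators and flags named in the table. This is an illustrative fragment rather than one of the Omniperf samples, and error checking is omitted for brevity:
+
+```c++
+#include <hip/hip_runtime.h>
+
+void allocate_each_memory_type(size_t bytes) {
+  void *uc, *nc, *cc, *rw;
+  // Uncached (UC): fine-grained host memory, default hipHostMalloc flags.
+  (void)hipHostMalloc(&uc, bytes, hipHostMallocDefault);
+  // Non-hardware-Coherent (NC): coarse-grained host memory.
+  (void)hipHostMalloc(&nc, bytes, hipHostMallocNonCoherent);
+  // Coherently Cachable (CC): fine-grained memory on the local accelerator.
+  (void)hipExtMallocWithFlags(&cc, bytes, hipDeviceMallocFinegrained);
+  // Read/Write Coherent (RW): coarse-grained memory on the local accelerator.
+  (void)hipMalloc(&rw, bytes);
+}
+```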
+ +A good discussion of coarse- and fine-grained memory allocations and what type of memory is returned by various combinations of memory allocators, flags and arguments can be found in the [Crusher Quick-Start Guide](https://docs.olcf.ornl.gov/systems/crusher_quick_start_guide.html#floating-point-fp-atomic-operations-and-coarse-fine-grained-memory-allocations). + +(profiling-with-omniperf)= +# Profiling with Omniperf by Example + +(VALU_inst_mix_example)= +## VALU Arithmetic Instruction Mix + +For this example, we consider the [instruction mix sample](https://github.com/AMDResearch/omniperf/blob/dev/sample/instmix.hip) distributed as a part of Omniperf. + +```{note} +This example is expected to work on all CDNA accelerators; however, the results in this section were collected on an [MI2XX](2xxnote) accelerator. +``` + +### Design note + +This code uses a number of inline assembly instructions to cleanly identify the types of instructions being issued, as well as to avoid optimization / dead-code elimination by the compiler. +While inline assembly is inherently unportable, this example is expected to work on all GCN GPUs and CDNA accelerators. + +We reproduce a sample of the kernel below: + +```c++ + // fp32: add, mul, transcendental and fma + float f1, f2; + asm volatile( + "v_add_f32_e32 %0, %1, %0\n" + "v_mul_f32_e32 %0, %1, %0\n" + "v_sqrt_f32 %0, %1\n" + "v_fma_f32 %0, %1, %0, %1\n" + : "=v"(f1) + : "v"(f2)); +``` + +These instructions correspond to: + - A 32-bit floating point addition, + - A 32-bit floating point multiplication, + - A 32-bit floating point square-root transcendental operation, and + - A 32-bit floating point fused multiply-add operation. + +For more detail, the reader is referred to (e.g.,) the [CDNA2 ISA Guide](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf). + +### Instruction mix + +This example was compiled and run on an MI250 accelerator using ROCm v5.6.0 and Omniperf v2.0.0. +```shell-session +$ hipcc -O3 instmix.hip -o instmix +``` + +We generate our profile for this example via: +```shell-session +$ omniperf profile -n instmix --no-roof -- ./instmix +``` + +and finally, analyze the instruction mix section: +```shell-session +$ omniperf analyze -p workloads/instmix/mi200/ -b 10.2 +<...> +10.
Compute Units - Instruction Mix +10.2 VALU Arithmetic Instr Mix +╒═════════╤════════════╤═════════╤════════════════╕ +│ Index │ Metric │ Count │ Unit │ +╞═════════╪════════════╪═════════╪════════════════╡ +│ 10.2.0 │ INT32 │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.1 │ INT64 │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.2 │ F16-ADD │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.3 │ F16-MUL │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.4 │ F16-FMA │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.5 │ F16-Trans │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.6 │ F32-ADD │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.7 │ F32-MUL │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.8 │ F32-FMA │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.9 │ F32-Trans │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.10 │ F64-ADD │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.11 │ F64-MUL │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.12 │ F64-FMA │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.13 │ F64-Trans │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.14 │ Conversion │ 1.00 │ Instr per wave │ +╘═════════╧════════════╧═════════╧════════════════╛ +``` + +shows that we have exactly one of each type of VALU arithmetic instruction, by construction! + +(Fabric_transactions_example)= +## Infinity-Fabric(tm) transactions + +For this example, we consider the [Infinity Fabric(tm) sample](https://github.com/AMDResearch/omniperf/blob/dev/sample/fabric.hip) distributed as a part of Omniperf. +This code launches a simple read-only kernel, e.g.: + +```c++ +// the main streaming kernel +__global__ void kernel(int* x, size_t N, int zero) { + int sum = 0; + const size_t offset_start = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = 0; i < 10; ++i) { + for (size_t offset = offset_start; offset < N; offset += blockDim.x * gridDim.x) { + sum += x[offset]; + } + } + if (sum != 0) { + x[offset_start] = sum; + } +} +``` + +twice; once as a warmup, and once for analysis. +We note that the buffer `x` is initialized to all zeros via a call to `hipMemcpy` on the host before the kernel is ever launched, therefore the conditional: + +```c++ +if (sum != 0) { ... +``` + +is identically false (and thus: we expect no writes). + +```{note} +The actual sample included with Omniperf also includes the ability to select different operation types, e.g., atomics, writes, etc. +This abbreviated version is presented here for reference only. +``` + +Finally, this sample code lets the user control: + - The [granularity of an allocation](Mtype), + - The owner of an allocation (local HBM, CPU DRAM or remote HBM), and + - The size of an allocation (the default is $\sim4$GiB) + +via command line arguments. +In doing so, we can explore the impact of these parameters on the L2-Fabric metrics reported by Omniperf to further understand their meaning. + +All results in this section were generated an a node of Infinity Fabric(tm) connected MI250 accelerators using ROCm v5.6.0, and Omniperf v2.0.0. 
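+
+To make that setup concrete, below is a minimal sketch of the host-side driver implied by the description above. This is not the sample's actual code: the helper name, buffer size, and launch geometry are illustrative assumptions, but it shows the zero-initialization via `hipMemcpy` and the warmup-plus-measured launch pattern used throughout these experiments.
+
+```c++
+// Hedged sketch of the host-side setup (illustrative only, not fabric.hip's
+// actual driver): zero-fill the buffer so `sum != 0` is always false, then
+// launch the kernel once as a warmup and once for the analyzed dispatch.
+#include <hip/hip_runtime.h>
+#include <vector>
+
+__global__ void kernel(int* x, size_t N, int zero);  // the streaming kernel shown above
+
+void run_read_experiment(int* x_dev, size_t N) {
+  std::vector<int> zeros(N, 0);
+  // the buffer is all zeros before the first launch, so the kernel never writes
+  (void)hipMemcpy(x_dev, zeros.data(), N * sizeof(int), hipMemcpyHostToDevice);
+  kernel<<<4096, 256>>>(x_dev, N, 0);  // warmup launch
+  kernel<<<4096, 256>>>(x_dev, N, 0);  // measured launch
+  (void)hipDeviceSynchronize();
+}
+```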
+Although results may vary with ROCm versions and accelerator connectivity, we expect the lessons learned here to be broadly applicable. + +(Fabric_exp_1)= +### Experiment #1 - Coarse-grained, accelerator-local HBM reads + +In our first experiment, we consider the simplest possible case, a `hipMalloc`'d buffer that is local to our current accelerator: + +```shell-session +$ omniperf profile -n coarse_grained_local --no-roof -- ./fabric -t 1 -o 0 +Using: + mtype:CoarseGrained + mowner:Device + mspace:Global + mop:Read + mdata:Unsigned + remoteId:-1 +<...> +$ omniperf analyze -p workloads/coarse_grained_local/mi200 -b 17.2.0 17.2.1 17.2.2 17.4.0 17.4.1 17.4.2 17.5.0 17.5.1 17.5.2 17.5.3 17.5.4 -n per_kernel --dispatch 2 +<...> +17. L2 Cache +17.2 L2 - Fabric Transactions +╒═════════╤═════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ +│ 17.2.0 │ L2-Fabric Read BW │ 42947428672.00 │ 42947428672.00 │ 42947428672.00 │ Bytes per kernel │ +├─────────┼─────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.1 │ HBM Read Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +├─────────┼─────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.2 │ Remote Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧═════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ +17.4 L2 - Fabric Interface Stalls +╒═════════╤═══════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ +│ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ +│ 17.4.0 │ Read - PCIe Stall │ PCIe Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.1 │ Read - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.2 │ Read - HBM Stall │ HBM Stall │ Read │ 0.07 │ 0.07 │ 0.07 │ Pct │ +╘═════════╧═══════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ +17.5 L2 - Fabric Detailed Transaction Breakdown +╒═════════╤═════════════════╤══════════════╤══════════════╤══════════════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════╪══════════════╪══════════════╪══════════════╪════════════════╡ +│ 17.5.0 │ Read (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.1 │ Read (Uncached) │ 1450.00 │ 1450.00 │ 1450.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.2 │ Read (64B) │ 671053573.00 │ 671053573.00 │ 671053573.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.3 │ HBM Read │ 671053565.00 │ 671053565.00 │ 671053565.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.4 │ Remote Read │ 8.00 │ 8.00 │ 8.00 │ Req per 
kernel │
+╘═════════╧═════════════════╧══════════════╧══════════════╧══════════════╧════════════════╛
+```
+
+Here, we see:
+ - The vast majority of L2-Fabric requests (>99%) are 64B read requests (17.5.2)
+ - Nearly 100% of the read requests (17.2.1) are homed in on the accelerator-local HBM (17.5.3), while some small fraction of these reads are routed to a "remote" device (17.5.4)
+ - These drive $\sim40$GiB of read bandwidth per kernel (17.2.0)
+
+In addition, we see a small amount of [uncached](Mtype) reads (17.5.1); these correspond to things like:
+ - the assembly code to execute the kernel
+ - kernel arguments
+ - coordinate parameters (e.g., blockDim.z) that were not initialized by the hardware, etc.
+and may account for some of our 'remote' read requests (17.5.4), e.g., reading from CPU DRAM.
+
+The above list is not exhaustive, nor are all of these guaranteed to be 'uncached' -- the exact implementation depends on the accelerator and ROCm versions used.
+These read requests could be interrogated further in the [Scalar L1 Data Cache](sL1D) and [Instruction Cache](L1I) metric sections.
+
+```{note}
+The Traffic metrics in Sec 17.2 are presented as a percentage of the total number of requests, e.g., 'HBM Read Traffic' is the percent of read requests (17.5.0-17.5.2) that were directed to the accelerator's local HBM (17.5.3).
+```
+
+(Fabric_exp_2)=
+### Experiment #2 - Fine-grained, accelerator-local HBM reads
+
+In this experiment, we change the [granularity](Mtype) of our device allocation to be fine-grained device memory, local to the current accelerator.
+Our code uses the `hipExtMallocWithFlags` API with the `hipDeviceMallocFinegrained` flag to accomplish this.
+
+```{note}
+On some systems (e.g., those with only PCIe(r) connected accelerators), you need to set the environment variable `HSA_FORCE_FINE_GRAIN_PCIE=1` to enable this memory type.
+```
+
+```shell-session
+$ omniperf profile -n fine_grained_local --no-roof -- ./fabric -t 0 -o 0
+Using:
+ mtype:FineGrained
+ mowner:Device
+ mspace:Global
+ mop:Read
+ mdata:Unsigned
+ remoteId:-1
+<...>
+$ omniperf analyze -p workloads/fine_grained_local/mi200 -b 17.2.0 17.2.1 17.2.2 17.2.3 17.4.0 17.4.1 17.4.2 17.5.0 17.5.1 17.5.2 17.5.3 17.5.4 -n per_kernel --dispatch 2
+<...>
+17. 
L2 Cache +17.2 L2 - Fabric Transactions +╒═════════╤═══════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ +│ 17.2.0 │ L2-Fabric Read BW │ 42948661824.00 │ 42948661824.00 │ 42948661824.00 │ Bytes per kernel │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.1 │ HBM Read Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.2 │ Remote Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.3 │ Uncached Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧═══════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ +17.4 L2 - Fabric Interface Stalls +╒═════════╤═══════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ +│ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ +│ 17.4.0 │ Read - PCIe Stall │ PCIe Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.1 │ Read - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.2 │ Read - HBM Stall │ HBM Stall │ Read │ 0.07 │ 0.07 │ 0.07 │ Pct │ +╘═════════╧═══════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ +17.5 L2 - Fabric Detailed Transaction Breakdown +╒═════════╤═════════════════╤══════════════╤══════════════╤══════════════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════╪══════════════╪══════════════╪══════════════╪════════════════╡ +│ 17.5.0 │ Read (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.1 │ Read (Uncached) │ 1334.00 │ 1334.00 │ 1334.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.2 │ Read (64B) │ 671072841.00 │ 671072841.00 │ 671072841.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.3 │ HBM Read │ 671072835.00 │ 671072835.00 │ 671072835.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.4 │ Remote Read │ 6.00 │ 6.00 │ 6.00 │ Req per kernel │ +╘═════════╧═════════════════╧══════════════╧══════════════╧══════════════╧════════════════╛ +``` + +Comparing with our [previous example](Fabric_exp_1), we see a relatively similar result, namely: + - The vast majority of L2-Fabric requests are 64B read requests (17.5.2) + - Nearly all these read requests are directed to the accelerator-local HBM (17.2.1) + +In addition, we now see a small percentage of HBM Read Stalls (17.4.2), as streaming fine-grained memory is putting more stress on Infinity Fabric(tm). 
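+
+The fabric experiments differ mainly in how (and where) the buffer is allocated. As a reference, here is a hedged sketch of the allocation paths they exercise; the helper and its arguments are illustrative assumptions, but the HIP calls and flags are the ones named in the text (error checking omitted for brevity):
+
+```c++
+// Illustrative sketch of the allocation variants used by Experiments #1-#5
+// (not the fabric sample's exact code, which selects these via its -t/-o flags).
+#include <hip/hip_runtime.h>
+
+int* allocate_buffer(size_t bytes, bool fine_grained, int owner /*0=local, 1=host, 2=remote*/) {
+  void* p = nullptr;
+  if (owner == 1) {
+    // CPU DRAM: fine-grained by default, coarse-grained with hipHostMallocNonCoherent
+    (void)hipHostMalloc(&p, bytes, fine_grained ? hipHostMallocDefault : hipHostMallocNonCoherent);
+  } else {
+    if (owner == 2) (void)hipSetDevice(1);  // allocate on a remote accelerator (Experiment #3)
+    if (fine_grained)
+      (void)hipExtMallocWithFlags(&p, bytes, hipDeviceMallocFinegrained);  // fine-grained HBM
+    else
+      (void)hipMalloc(&p, bytes);  // coarse-grained HBM (Experiment #1)
+    if (owner == 2) (void)hipSetDevice(0);  // reset to the default device before launching
+  }
+  return static_cast<int*>(p);
+}
+```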
+ +```{note} +The stalls in Sec 17.4 are presented as a percentage of the total number active L2 cycles, summed over [all L2 channels](L2). +``` + +(Fabric_exp_3)= +### Experiment #3 - Fine-grained, remote-accelerator HBM reads + +In this experiment, we move our [fine-grained](Mtype) allocation to be owned by a remote accelerator. +We accomplish this by first changing the HIP device using e.g., `hipSetDevice(1)` API, then allocating fine-grained memory (as described [previously](Fabric_exp_2)), and finally resetting the device back to the default, e.g., `hipSetDevice(0)`. + +Although we have not changed our code significantly, we do see a substantial change in the L2-Fabric metrics: + +```shell-session +$ omniperf profile -n fine_grained_remote --no-roof -- ./fabric -t 0 -o 2 +Using: + mtype:FineGrained + mowner:Remote + mspace:Global + mop:Read + mdata:Unsigned + remoteId:-1 +<...> +$ omniperf analyze -p workloads/fine_grained_remote/mi200 -b 17.2.0 17.2.1 17.2.2 17.2.3 17.4.0 17.4.1 17.4.2 17.5.0 17.5.1 17.5.2 17.5.3 17.5.4 -n per_kernel --dispatch 2 +<...> +17. L2 Cache +17.2 L2 - Fabric Transactions +╒═════════╤═══════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ +│ 17.2.0 │ L2-Fabric Read BW │ 42949692736.00 │ 42949692736.00 │ 42949692736.00 │ Bytes per kernel │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.1 │ HBM Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.2 │ Remote Read Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.3 │ Uncached Read Traffic │ 200.00 │ 200.00 │ 200.00 │ Pct │ +╘═════════╧═══════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ +17.4 L2 - Fabric Interface Stalls +╒═════════╤═══════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ +│ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ +│ 17.4.0 │ Read - PCIe Stall │ PCIe Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.1 │ Read - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Read │ 17.85 │ 17.85 │ 17.85 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.2 │ Read - HBM Stall │ HBM Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧═══════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ +17.5 L2 - Fabric Detailed Transaction Breakdown +╒═════════╤═════════════════╤═══════════════╤═══════════════╤═══════════════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════╪═══════════════╪═══════════════╪═══════════════╪════════════════╡ +│ 17.5.0 │ Read (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ +├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ +│ 17.5.1 │ Read (Uncached) │ 1342177894.00 │ 
1342177894.00 │ 1342177894.00 │ Req per kernel │
+├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤
+│ 17.5.2  │ Read (64B)      │ 671088949.00  │ 671088949.00  │ 671088949.00  │ Req per kernel │
+├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤
+│ 17.5.3  │ HBM Read        │ 307.00        │ 307.00        │ 307.00        │ Req per kernel │
+├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤
+│ 17.5.4  │ Remote Read     │ 671088642.00  │ 671088642.00  │ 671088642.00  │ Req per kernel │
+╘═════════╧═════════════════╧═══════════════╧═══════════════╧═══════════════╧════════════════╛
+```
+
+First, we see that while we still observe approximately the same number of 64B Read Requests (17.5.2), we now see an even larger number of Uncached Read Requests (17.5.1). Some simple division reveals:
+```math
+1342177894.00 / 671088949.00 ≈ 2
+```
+That is, each 64B Read Request is _also_ counted as two Uncached Read Requests, as reflected in the [request-flow diagram](fabric-fig).
+This is also why the Uncached Read Traffic metric (17.2.3) is at the counter-intuitive value of 200%!
+
+In addition, we also observe that:
+ - we no longer see any significant number of HBM Read Requests (17.2.1, 17.5.3), nor HBM Read Stalls (17.4.2), but instead
+ - we observe that almost all of these requests are considered "remote" (17.2.2, 17.5.4), being routed to another accelerator, or the CPU --- in this case HIP Device 1 --- and
+ - we observe a significantly larger percentage of AMD Infinity Fabric(tm) Read Stalls (17.4.1) as compared to the HBM Read Stalls in the [previous example](Fabric_exp_2)
+
+These stalls correspond to reads that are going out over the AMD Infinity Fabric(tm) connection to another MI250 accelerator.
+In addition, because these are crossing between accelerators, we expect significantly lower achievable bandwidths as compared to the local accelerator's HBM -- this is reflected (indirectly) in the magnitude of the stall metric (17.4.1).
+Finally, we note that if our system contained only PCIe(r) connected accelerators, these observations would differ.
+
+(Fabric_exp_4)=
+### Experiment #4 - Fine-grained, CPU-DRAM reads
+
+In this experiment, we move our [fine-grained](Mtype) allocation to be owned by the CPU's DRAM.
+We accomplish this by allocating host-pinned fine-grained memory using the `hipHostMalloc` API:
+
+```shell-session
+$ omniperf profile -n fine_grained_host --no-roof -- ./fabric -t 0 -o 1
+Using:
+ mtype:FineGrained
+ mowner:Host
+ mspace:Global
+ mop:Read
+ mdata:Unsigned
+ remoteId:-1
+<...>
+$ omniperf analyze -p workloads/fine_grained_host/mi200 -b 17.2.0 17.2.1 17.2.2 17.2.3 17.4.0 17.4.1 17.4.2 17.5.0 17.5.1 17.5.2 17.5.3 17.5.4 -n per_kernel --dispatch 2
+<...>
+17. 
L2 Cache +17.2 L2 - Fabric Transactions +╒═════════╤═══════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ +│ 17.2.0 │ L2-Fabric Read BW │ 42949691264.00 │ 42949691264.00 │ 42949691264.00 │ Bytes per kernel │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.1 │ HBM Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.2 │ Remote Read Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.3 │ Uncached Read Traffic │ 200.00 │ 200.00 │ 200.00 │ Pct │ +╘═════════╧═══════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ +17.4 L2 - Fabric Interface Stalls +╒═════════╤═══════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ +│ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ +│ 17.4.0 │ Read - PCIe Stall │ PCIe Stall │ Read │ 91.29 │ 91.29 │ 91.29 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.1 │ Read - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.2 │ Read - HBM Stall │ HBM Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧═══════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ +17.5 L2 - Fabric Detailed Transaction Breakdown +╒═════════╤═════════════════╤═══════════════╤═══════════════╤═══════════════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════╪═══════════════╪═══════════════╪═══════════════╪════════════════╡ +│ 17.5.0 │ Read (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ +├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ +│ 17.5.1 │ Read (Uncached) │ 1342177848.00 │ 1342177848.00 │ 1342177848.00 │ Req per kernel │ +├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ +│ 17.5.2 │ Read (64B) │ 671088926.00 │ 671088926.00 │ 671088926.00 │ Req per kernel │ +├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ +│ 17.5.3 │ HBM Read │ 284.00 │ 284.00 │ 284.00 │ Req per kernel │ +├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ +│ 17.5.4 │ Remote Read │ 671088642.00 │ 671088642.00 │ 671088642.00 │ Req per kernel │ +╘═════════╧═════════════════╧═══════════════╧═══════════════╧═══════════════╧════════════════╛ +``` + +Here we see _almost_ the same results as in the [previous experiment](Fabric_exp_3), however now as we are crossing a PCIe(r) bus to the CPU, we see that the Infinity Fabric(tm) Read stalls (17.4.1) have shifted to be a PCIe(r) stall (17.4.2). 
+In addition, as (on this system) the PCIe(r) bus has a lower peak bandwidth than the AMD Infinity Fabric(TM) connection between two accelerators, we once again observe an increase in the percentage of stalls on this interface. + +```{note} +Had we performed this same experiment on a [MI250X system](https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf), these transactions would again have been marked as Infinity Fabric(tm) Read stalls (17.4.1), as the CPU is connected to the accelerator via AMD Infinity Fabric. +``` + +(Fabric_exp_5)= +### Experiment #5 - Coarse-grained, CPU-DRAM reads + +In our next fabric experiment, we change our CPU memory allocation to be [coarse-grained](Mtype). +We accomplish this by passing the `hipHostMalloc` API the `hipHostMallocNonCoherent` flag, to mark the allocation as coarse-grained: + +```shell-session +$ omniperf profile -n coarse_grained_host --no-roof -- ./fabric -t 1 -o 1 +Using: + mtype:CoarseGrained + mowner:Host + mspace:Global + mop:Read + mdata:Unsigned + remoteId:-1 +<...> +$ omniperf analyze -p workloads/coarse_grained_host/mi200 -b 17.2.0 17.2.1 17.2.2 17.2.3 17.4.0 17.4.1 17.4.2 17.5.0 17.5.1 17.5.2 17.5.3 17.5.4 -n per_kernel --dispatch 2 +<...> +17. L2 Cache +17.2 L2 - Fabric Transactions +╒═════════╤═══════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ +│ 17.2.0 │ L2-Fabric Read BW │ 42949691264.00 │ 42949691264.00 │ 42949691264.00 │ Bytes per kernel │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.1 │ HBM Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.2 │ Remote Read Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.3 │ Uncached Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧═══════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ +17.4 L2 - Fabric Interface Stalls +╒═════════╤═══════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ +│ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ +│ 17.4.0 │ Read - PCIe Stall │ PCIe Stall │ Read │ 91.27 │ 91.27 │ 91.27 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.1 │ Read - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.2 │ Read - HBM Stall │ HBM Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧═══════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ +17.5 L2 - Fabric Detailed Transaction Breakdown +╒═════════╤═════════════════╤══════════════╤══════════════╤══════════════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════╪══════════════╪══════════════╪══════════════╪════════════════╡ +│ 17.5.0 │ Read (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ 
+├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.1 │ Read (Uncached) │ 562.00 │ 562.00 │ 562.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.2 │ Read (64B) │ 671088926.00 │ 671088926.00 │ 671088926.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.3 │ HBM Read │ 281.00 │ 281.00 │ 281.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.4 │ Remote Read │ 671088645.00 │ 671088645.00 │ 671088645.00 │ Req per kernel │ +╘═════════╧═════════════════╧══════════════╧══════════════╧══════════════╧════════════════╛ +``` + +Here we see a similar result to our [previous experiment](Fabric_exp_4), with one key difference: our accesses are no longer marked as Uncached Read requests (17.2.3, 17.5.1), but instead are 64B read requests (17.5.2), as observed in our [Coarse-grained, accelerator-local HBM](Fabric_exp_1) experiment. + +(Fabric_exp_6)= +### Experiment #6 - Fine-grained, CPU-DRAM writes + +Thus far in our exploration of the L2-Fabric interface, we have primarily focused on read operations. +However, in [our request flow diagram](fabric-fig), we note that writes are counted separately. +To obeserve this, we use the '-p' flag to trigger write operations to fine-grained memory allocated on the host: + +```shell-session +$ omniperf profile -n fine_grained_host_write --no-roof -- ./fabric -t 0 -o 1 -p 1 +Using: + mtype:FineGrained + mowner:Host + mspace:Global + mop:Write + mdata:Unsigned + remoteId:-1 +<...> +$ omniperf analyze -p workloads/fine_grained_host_writes/mi200 -b 17.2.4 17.2.5 17.2.6 17.2.7 17.2.8 17.4.3 17.4.4 17.4.5 17.4.6 17.5.5 17.5.6 17.5.7 17.5.8 17.5.9 17.5.10 -n per_kernel --dispatch 2 +<...> +17. 
L2 Cache +17.2 L2 - Fabric Transactions +╒═════════╤═══════════════════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ +│ 17.2.4 │ L2-Fabric Write and Atomic BW │ 42949672960.00 │ 42949672960.00 │ 42949672960.00 │ Bytes per kernel │ +├─────────┼───────────────────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.5 │ HBM Write and Atomic Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.6 │ Remote Write and Atomic Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +├─────────┼───────────────────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.7 │ Atomic Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.8 │ Uncached Write and Atomic Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +╘═════════╧═══════════════════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ +17.4 L2 - Fabric Interface Stalls +╒═════════╤════════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ +│ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ +│ 17.4.3 │ Write - PCIe Stall │ PCIe Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.4 │ Write - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.5 │ Write - HBM Stall │ HBM Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.6 │ Write - Credit Starvation │ Credit Starvation │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧════════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ +17.5 L2 - Fabric Detailed Transaction Breakdown +╒═════════╤═════════════════════════╤══════════════╤══════════════╤══════════════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════════════╪══════════════╪══════════════╪══════════════╪════════════════╡ +│ 17.5.5 │ Write (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ +├─────────┼─────────────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.6 │ Write (Uncached) │ 671088640.00 │ 671088640.00 │ 671088640.00 │ Req per kernel │ +├─────────┼─────────────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.7 │ Write (64B) │ 671088640.00 │ 671088640.00 │ 671088640.00 │ Req per kernel │ +├─────────┼─────────────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.8 │ HBM Write and Atomic │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ +├─────────┼─────────────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.9 │ Remote Write and Atomic │ 
671088640.00 │ 671088640.00 │ 671088640.00 │ Req per kernel │
+├─────────┼─────────────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤
+│ 17.5.10 │ Atomic                  │ 0.00         │ 0.00         │ 0.00         │ Req per kernel │
+╘═════════╧═════════════════════════╧══════════════╧══════════════╧══════════════╧════════════════╛
+```
+
+Here we notice a few changes in our request pattern:
+ - As expected, the requests have changed from 64B Reads to 64B Write requests (17.5.7),
+ - these requests are homed in on a "remote" destination (17.2.6, 17.5.9), as expected, and,
+ - these are also each counted as an Uncached Write request (17.5.6).
+
+In addition, there are rather significant changes in the bandwidth values reported:
+ - the "L2-Fabric Write and Atomic" bandwidth metric (17.2.4) reports about 40GiB of data written across Infinity Fabric(tm), while
+ - the "Remote Write and Atomic Traffic" metric (17.2.6) indicates that nearly 100% of these requests are being directed to a remote source
+
+The precise meaning of these metrics will be explored in the [subsequent experiment](Fabric_exp_7).
+
+Finally, we note that we see no write stalls on the PCIe(r) bus (17.4.3). This is because writes over a PCIe(r) bus [are posted transactions](https://members.pcisig.com/wg/PCI-SIG/document/10912), i.e., they do not require acknowledgement.
+
+(Fabric_exp_7)=
+### Experiment #7 - Fine-grained, CPU-DRAM atomicAdd
+
+Next, we change our experiment to instead target `atomicAdd` operations to the CPU's DRAM.
+
+```shell-session
+$ omniperf profile -n fine_grained_host_add --no-roof -- ./fabric -t 0 -o 1 -p 2
+Using:
+ mtype:FineGrained
+ mowner:Host
+ mspace:Global
+ mop:Add
+ mdata:Unsigned
+ remoteId:-1
+<...>
+$ omniperf analyze -p workloads/fine_grained_host_add/mi200 -b 17.2.4 17.2.5 17.2.6 17.2.7 17.2.8 17.4.3 17.4.4 17.4.5 17.4.6 17.5.5 17.5.6 17.5.7 17.5.8 17.5.9 17.5.10 -n per_kernel --dispatch 2
+<...>
+17. 
L2 Cache +17.2 L2 - Fabric Transactions +╒═════════╤═══════════════════════════════════╤══════════════╤══════════════╤══════════════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════════════════╪══════════════╪══════════════╪══════════════╪══════════════════╡ +│ 17.2.4 │ L2-Fabric Write and Atomic BW │ 429496736.00 │ 429496736.00 │ 429496736.00 │ Bytes per kernel │ +├─────────┼───────────────────────────────────┼──────────────┼──────────────┼──────────────┼──────────────────┤ +│ 17.2.5 │ HBM Write and Atomic Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────────┼──────────────┼──────────────┼──────────────┼──────────────────┤ +│ 17.2.6 │ Remote Write and Atomic Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +├─────────┼───────────────────────────────────┼──────────────┼──────────────┼──────────────┼──────────────────┤ +│ 17.2.7 │ Atomic Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +├─────────┼───────────────────────────────────┼──────────────┼──────────────┼──────────────┼──────────────────┤ +│ 17.2.8 │ Uncached Write and Atomic Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +╘═════════╧═══════════════════════════════════╧══════════════╧══════════════╧══════════════╧══════════════════╛ +17.4 L2 - Fabric Interface Stalls +╒═════════╤════════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ +│ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ +│ 17.4.3 │ Write - PCIe Stall │ PCIe Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.4 │ Write - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.5 │ Write - HBM Stall │ HBM Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.6 │ Write - Credit Starvation │ Credit Starvation │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧════════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ +17.5 L2 - Fabric Detailed Transaction Breakdown +╒═════════╤═════════════════════════╤═════════════╤═════════════╤═════════════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════════════╪═════════════╪═════════════╪═════════════╪════════════════╡ +│ 17.5.5 │ Write (32B) │ 13421773.00 │ 13421773.00 │ 13421773.00 │ Req per kernel │ +├─────────┼─────────────────────────┼─────────────┼─────────────┼─────────────┼────────────────┤ +│ 17.5.6 │ Write (Uncached) │ 13421773.00 │ 13421773.00 │ 13421773.00 │ Req per kernel │ +├─────────┼─────────────────────────┼─────────────┼─────────────┼─────────────┼────────────────┤ +│ 17.5.7 │ Write (64B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ +├─────────┼─────────────────────────┼─────────────┼─────────────┼─────────────┼────────────────┤ +│ 17.5.8 │ HBM Write and Atomic │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ +├─────────┼─────────────────────────┼─────────────┼─────────────┼─────────────┼────────────────┤ +│ 17.5.9 │ Remote Write and Atomic │ 13421773.00 │ 13421773.00 │ 13421773.00 │ Req per kernel │ 
+├─────────┼─────────────────────────┼─────────────┼─────────────┼─────────────┼────────────────┤ +│ 17.5.10 │ Atomic │ 13421773.00 │ 13421773.00 │ 13421773.00 │ Req per kernel │ +╘═════════╧═════════════════════════╧═════════════╧═════════════╧═════════════╧════════════════╛ +``` + +In this case, there is quite a lot to unpack: + - For the first time, the 32B Write requests (17.5.5) are heavily used. + - These correspond to Atomic requests (17.2.7, 17.5.10), and are counted as Uncached Writes (17.5.6). + - The L2-Fabric Write and Atomic bandwidth metric (17.2.4) shows about 0.4 GiB of traffic. For convenience, the sample reduces the default problem size for this case due to the speed of atomics across a PCIe(r) bus, and finally, + - The traffic is directed to a remote device (17.2.6, 17.5.9) + +Let us consider what an "atomic" request means in this context. +Recall that we are discussing memory traffic flowing from the L2 cache, the device-wide coherence point on current CDNA accelerators such as the MI250, to e.g., the CPU's DRAM. +In this light, we see that these requests correspond to _system scope_ atomics, and specifically in the case of the MI250, to fine-grained memory! + + + +## Vector memory operation counting + +(flatmembench)= +### Global / Generic (FLAT) + +For this example, we consider the [vector-memory sample](https://github.com/AMDResearch/omniperf/blob/dev/sample/vmem.hip) distributed as a part of Omniperf. +This code launches many different versions of a simple read/write/atomic-only kernels targeting various address spaces, e.g. below is our simple `global_write` kernel: + +```c++ +// write to a global pointer +__global__ void global_write(int* ptr, int zero) { + ptr[threadIdx.x] = zero; +} +``` + +This example was compiled and run on an MI250 accelerator using ROCm v5.6.0, and Omniperf v2.0.0. +```shell-session +$ hipcc -O3 --save-temps vmem.hip -o vmem +``` +We have also chosen to include the `--save-temps` flag to save the compiler temporary files, such as the generated CDNA assembly code, for inspection. + +Finally, we generate our omniperf profile as: +```shell-session +$ omniperf profile -n vmem --no-roof -- ./vmem +``` + +(Flat_design)= +#### Design note + +We should explain some of the more peculiar line(s) of code in our example, e.g., the use of compiler builtins and explicit address space casting, etc. +```c++ +// write to a generic pointer +typedef int __attribute__((address_space(0)))* generic_ptr; + +__attribute__((noinline)) __device__ void generic_store(generic_ptr ptr, int zero) { *ptr = zero; } + +__global__ void generic_write(int* ptr, int zero, int filter) { + __shared__ int lds[1024]; + int* generic = (threadIdx.x < filter) ? &ptr[threadIdx.x] : &lds[threadIdx.x]; + generic_store((generic_ptr)generic, zero); +} +``` + +One of our aims in this example is to demonstrate the use of the ['generic' (a.k.a., FLAT)](https://llvm.org/docs/AMDGPUUsage.html#address-space-identifier) address space. +This address space is typically used when the compiler cannot statically prove where the backing memory is located. + +To try to _force_ the compiler to use this address space, we have applied `__attribute__((noinline))` to the `generic_store` function to have the compiler treat it as a function call (i.e., on the other-side of which, the address space may not be known). 
+However, in a trivial example such as this, the compiler may choose to specialize the `generic_store` function to the two address spaces that may provably be used from our translation-unit, i.e., ['local' (a.k.a., LDS)](Mspace) and ['global'](Mspace). Hence, we forcibly cast the address space to ['generic' (i.e., FLAT)](Mspace) to avoid this compiler optimization. + +```{warning} +While convenient for our example here, this sort of explicit address space casting can lead to strange compilation errors, and in the worst cases, incorrect results and thus use is discouraged in production code. +``` + +For more details on address spaces, the reader is referred to the [address-space section](Mspace). + +#### Global Write + +First, we demonstrate our simple `global_write` kernel: +```shell-session +$ omniperf analyze -p workloads/vmem/mi200/ --dispatch 1 -b 10.3 15.1.4 15.1.5 15.1.6 15.1.7 15.1.8 15.1.9 15.1.10 15.1.11 -n per_kernel +<...> +-------------------------------------------------------------------------------- +0. Top Stat +╒════╤═════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ +│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ +╞════╪═════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ +│ 0 │ global_write(int*, int) [clone .kd] │ 1.00 │ 2400.00 │ 2400.00 │ 2400.00 │ 100.00 │ +╘════╧═════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + + +-------------------------------------------------------------------------------- +10. Compute Units - Instruction Mix +10.3 VMEM Instr Mix +╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.1 │ Global/Generic Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.2 │ Global/Generic Write │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.3 │ Global/Generic Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + +-------------------------------------------------------------------------------- +15. 
Address Processing Unit and Data Return Path (TA/TD) +15.1 Address Processing Unit +╒═════════╤═════════════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 15.1.4 │ Total Instructions │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 15.1.5 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 15.1.6 │ Global/Generic Read Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 15.1.7 │ Global/Generic Write Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 15.1.8 │ Global/Generic Atomic Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 15.1.9 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 15.1.10 │ Spill/Stack Read Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 15.1.11 │ Spill/Stack Write Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +╘═════════╧═════════════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ +``` + +Here, we have presented both the information in the VMEM Instruction Mix table (10.3) and the Address Processing Unit (15.1). +We note that this data is expected to be identical, and hence we omit table 15.1 in our subsequent examples. + +In addition, as expected, we see a single Global/Generic write instruction (10.3.2, 15.1.7). +Inspecting the generated assembly: + +```asm + .protected _Z12global_writePii ; -- Begin function _Z12global_writePii + .globl _Z12global_writePii + .p2align 8 + .type _Z12global_writePii,@function +_Z12global_writePii: ; @_Z12global_writePii +; %bb.0: + s_load_dword s2, s[4:5], 0x8 + s_load_dwordx2 s[0:1], s[4:5], 0x0 + v_lshlrev_b32_e32 v0, 2, v0 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s2 + global_store_dword v0, v1, s[0:1] + s_endpgm + .section .rodata,#alloc + .p2align 6, 0x0 + .amdhsa_kernel _Z12global_writePii +``` + +we see that this corresponds to an instance of a `global_store_dword` operation. + +```{note} +The assembly in these experiments were generated for an [MI2XX](2xxnote) accelerator using ROCm 5.6.0, and may change depending on ROCm versions and the targeted hardware architecture +``` + +(Generic_write)= +#### Generic Write to LDS + +Next, we examine a generic write. +As discussed [previously](Flat_design), our `generic_write` kernel uses an address space cast to _force_ the compiler to choose our desired address space, regardless of other optimizations that may be possible. + +We also note that the `filter` parameter passed in as a kernel argument (see [example](https://github.com/AMDResearch/omniperf/blob/dev/sample/vmem.hip), or [design note](Flat_design)) is set to zero on the host, such that we always write to the 'local' (LDS) memory allocation `lds`. + +Examining this kernel in the VMEM Instruction Mix table yields: + +```shell-session +$ omniperf analyze -p workloads/vmem/mi200/ --dispatch 2 -b 10.3 -n per_kernel +<...> +0. 
Top Stat +╒════╤══════════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ +│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ +╞════╪══════════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ +│ 0 │ generic_write(int*, int, int) [clone .kd │ 1.00 │ 2880.00 │ 2880.00 │ 2880.00 │ 100.00 │ +│ │ ] │ │ │ │ │ │ +╘════╧══════════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + + +-------------------------------------------------------------------------------- +10. Compute Units - Instruction Mix +10.3 VMEM Instr Mix +╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.1 │ Global/Generic Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.2 │ Global/Generic Write │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.3 │ Global/Generic Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ +``` + +As expected we see a single generic write (10.3.2). +In the assembly generated for this kernel (in particular, we care about the `generic_store` function). We see that this corresponds to a `flat_store_dword` instruction: + +```asm + .type _Z13generic_storePii,@function +_Z13generic_storePii: ; @_Z13generic_storePii +; %bb.0: + s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + flat_store_dword v[0:1], v2 + s_waitcnt vmcnt(0) lgkmcnt(0) + s_setpc_b64 s[30:31] +.Lfunc_end0: +``` + +In addition, we note that we can observe the destination of this request by looking at the LDS Instructions metric (12.2.0): +```shell-session +$ omniperf analyze -p workloads/vmem/mi200/ --dispatch 2 -b 12.2.0 -n per_kernel +<...> +12. Local Data Share (LDS) +12.2 LDS Stats +╒═════════╤════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 12.2.0 │ LDS Instrs │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +╘═════════╧════════════╧═══════╧═══════╧═══════╧══════════════════╛ +``` +which indicates one LDS access. + +```{note} +Exercise for the reader: if this access had been targeted at global memory (e.g., by changing value of `filter`), where should we look for the memory traffic? Hint: see our [generic read](Generic_read) example. 
+``` + +#### Global read + +Next, we examine a simple global read operation: + +```c++ +__global__ void global_read(int* ptr, int zero) { + int x = ptr[threadIdx.x]; + if (x != zero) { + ptr[threadIdx.x] = x + 1; + } +} +``` + +Here we observe a now familiar pattern: + - Read a value in from global memory + - Have a write hidden behind a conditional that is impossible for the compiler to statically eliminate, but is identically false. In this case, our `main()` function initializes the data in `ptr` to zero. + +Running Omniperf on this kernel yields: + +```shell-session +$ omniperf analyze -p workloads/vmem/mi200/ --dispatch 3 -b 10.3 -n per_kernel +<...> +0. Top Stat +╒════╤════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ +│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ +╞════╪════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ +│ 0 │ global_read(int*, int) [clone .kd] │ 1.00 │ 4480.00 │ 4480.00 │ 4480.00 │ 100.00 │ +╘════╧════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + + +-------------------------------------------------------------------------------- +10. Compute Units - Instruction Mix +10.3 VMEM Instr Mix +╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.1 │ Global/Generic Read │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.2 │ Global/Generic Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.3 │ Global/Generic Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ +``` + +Here we see a single global/generic instruction (10.3.0) which, as expected, is a read (10.3.1). 
+
+(Generic_read)=
+#### Generic read from global memory
+
+For our generic read example, we choose to change the target of the generic read to be global memory:
+```c++
+__global__ void generic_read(int* ptr, int zero, int filter) {
+  __shared__ int lds[1024];
+  if (static_cast<int>(filter - 1) == zero) {
+    lds[threadIdx.x] = 0;  // initialize to zero to avoid conditional, but hide behind _another_ conditional
+  }
+  int* generic;
+  if (static_cast<int>(threadIdx.x) > filter - 1) {
+    generic = &ptr[threadIdx.x];
+  } else {
+    generic = &lds[threadIdx.x];
+    abort();
+  }
+  int x = generic_load((generic_ptr)generic);
+  if (x != zero) {
+    ptr[threadIdx.x] = x + 1;
+  }
+}
+```
+
+In addition to our usual `if (condition_that_wont_happen)` guard around the write operation, there is an additional conditional around the initialization of the `lds` buffer.
+We note that it's typically required to write to this buffer to prevent the compiler from eliminating the local memory branch entirely due to undefined behavior (use of an uninitialized value).
+However, to report _only_ our global memory read, we again hide this initialization behind an identically false conditional (both `zero` and `filter` are set to zero in the kernel launch). Note that this is a _different_ conditional from our pointer assignment (to avoid combination of the two).
+
+Running Omniperf on this kernel reports:
+```shell-session
+$ omniperf analyze -p workloads/vmem/mi200/ --dispatch 4 -b 10.3 12.2.0 16.3.10 -n per_kernel
+<...>
+0. Top Stat
+╒════╤══════════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕
+│    │ KernelName                               │ Count   │ Sum(ns)   │ Mean(ns)   │ Median(ns)   │ Pct    │
+╞════╪══════════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡
+│  0 │ generic_read(int*, int, int) [clone .kd] │ 1.00    │ 2240.00   │ 2240.00    │ 2240.00      │ 100.00 │
+╘════╧══════════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛
+
+
+--------------------------------------------------------------------------------
+10. 
Compute Units - Instruction Mix +10.3 VMEM Instr Mix +╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.1 │ Global/Generic Read │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.2 │ Global/Generic Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.3 │ Global/Generic Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + +-------------------------------------------------------------------------------- +12. Local Data Share (LDS) +12.2 LDS Stats +╒═════════╤════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 12.2.0 │ LDS Instrs │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +╘═════════╧════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + +-------------------------------------------------------------------------------- +16. Vector L1 Data Cache +16.3 L1D Cache Accesses +╒═════════╤════════════╤═══════╤═══════╤═══════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════╪═══════╪═══════╪═══════╪════════════════╡ +│ 16.3.10 │ L1-L2 Read │ 1.00 │ 1.00 │ 1.00 │ Req per kernel │ +╘═════════╧════════════╧═══════╧═══════╧═══════╧════════════════╛ +``` + +Here we observe: + - A single global/generic read operation (10.3.1), which + - Is not an LDS instruction (12.2), as seen in our [generic write](Generic_write) example, but is instead + - An L1-L2 read operation (16.3.10) + +That is, we have successfully targeted our generic read at global memory. +Inspecting the assembly shows this corresponds to a `flat_load_dword` instruction. + +(Global_atomic)= +#### Global atomic + +Our global atomic kernel: +```c++ +__global__ void global_atomic(int* ptr, int zero) { + atomicAdd(ptr, zero); +} +``` +simply atomically adds a (non-compile-time) zero value to a pointer. + +Running Omniperf on this kernel yields: +```shell-session +$ omniperf analyze -p workloads/vmem/mi200/ --dispatch 5 -b 10.3 16.3.12 -n per_kernel +<...> +0. 
Top Stat +╒════╤══════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ +│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ +╞════╪══════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ +│ 0 │ global_atomic(int*, int) [clone .kd] │ 1.00 │ 4640.00 │ 4640.00 │ 4640.00 │ 100.00 │ +╘════╧══════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + + +-------------------------------------------------------------------------------- +10. Compute Units - Instruction Mix +10.3 VMEM Instr Mix +╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.1 │ Global/Generic Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.2 │ Global/Generic Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.3 │ Global/Generic Atomic │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + +-------------------------------------------------------------------------------- +16. Vector L1 Data Cache +16.3 L1D Cache Accesses +╒═════════╤══════════════╤═══════╤═══════╤═══════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪══════════════╪═══════╪═══════╪═══════╪════════════════╡ +│ 16.3.12 │ L1-L2 Atomic │ 1.00 │ 1.00 │ 1.00 │ Req per kernel │ +╘═════════╧══════════════╧═══════╧═══════╧═══════╧════════════════╛ +``` + +Here we see a single global/generic atomic instruction (10.3.3), which corresponds to an L1-L2 atomic request (16.3.12). + +(Generic_atomic)= +#### Generic, mixed atomic + +In our final global/generic example, we look at a case where our generic operation targets both LDS and global memory: +```c++ +__global__ void generic_atomic(int* ptr, int filter, int zero) { + __shared__ int lds[1024]; + int* generic = (threadIdx.x % 2 == filter) ? &ptr[threadIdx.x] : &lds[threadIdx.x]; + generic_atomic((generic_ptr)generic, zero); +} +``` + +This assigns every other work-item to atomically update global memory or local memory. + +Running this kernel through Omniperf shows: +```shell-session +$ omniperf analyze -p workloads/vmem/mi200/ --dispatch 6 -b 10.3 12.2.0 16.3.12 -n per_kernel +<...> +0. 
Top Stat +╒════╤══════════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ +│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ +╞════╪══════════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ +│ 0 │ generic_atomic(int*, int, int) [clone .k │ 1.00 │ 3360.00 │ 3360.00 │ 3360.00 │ 100.00 │ +│ │ d] │ │ │ │ │ │ +╘════╧══════════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + + +10. Compute Units - Instruction Mix +10.3 VMEM Instr Mix +╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.1 │ Global/Generic Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.2 │ Global/Generic Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.3 │ Global/Generic Atomic │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + +-------------------------------------------------------------------------------- +12. Local Data Share (LDS) +12.2 LDS Stats +╒═════════╤════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 12.2.0 │ LDS Instrs │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +╘═════════╧════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + +-------------------------------------------------------------------------------- +16. Vector L1 Data Cache +16.3 L1D Cache Accesses +╒═════════╤══════════════╤═══════╤═══════╤═══════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪══════════════╪═══════╪═══════╪═══════╪════════════════╡ +│ 16.3.12 │ L1-L2 Atomic │ 1.00 │ 1.00 │ 1.00 │ Req per kernel │ +╘═════════╧══════════════╧═══════╧═══════╧═══════╧════════════════╛ +``` + +That is, we see: + - A single generic atomic instruction (10.3.3) that maps to both + - an LDS instruction (12.2.0), and + - an L1-L2 atomic request (16.3) + +We have demonstrated the ability of the generic address space to _dynamically_ target different backing memory! + +(buffermembench)= +### Spill/Scratch (BUFFER) + +Next we examine the use of 'Spill/Scratch' memory. +On current CDNA accelerators such as the [MI2XX](2xxnote), this is implemented using the [private](mspace) memory space, which maps to ['scratch' memory](https://llvm.org/docs/AMDGPUUsage.html#amdgpu-address-spaces) in AMDGPU hardware terminology. 

This type of memory can be accessed via different instructions depending on the specific architecture targeted. However, current CDNA accelerators such as the [MI2XX](2xxnote) use so-called `buffer` instructions to access private memory in a simple (and typically coalesced) manner. See [Sec. 9.1, 'Vector Memory Buffer Instructions' of the CDNA2 ISA guide](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf) for further reading on this instruction type.

We develop a [simple kernel](https://github.com/AMDResearch/omniperf/blob/dev/sample/stack.hip) that uses stack memory:
```c++
#include <hip/hip_runtime.h>
__global__ void knl(int* out, int filter) {
  int x[1024];
  x[filter] = 0;
  if (threadIdx.x < filter)
    out[threadIdx.x] = x[threadIdx.x];
}
```

Our strategy here is to:
  - Create a large stack buffer (that cannot reasonably fit into registers),
  - Write to a compile-time unknown location on the stack, and then,
  - Behind the typical compile-time unknown `if(condition_that_wont_happen)` guard,
  - Read from a different, compile-time unknown, location on the stack and write the result to global memory to prevent the compiler from optimizing it out.

This example was compiled and run on an MI250 accelerator using ROCm v5.6.0, and Omniperf v2.0.0:
```shell-session
$ hipcc -O3 stack.hip -o stack
```
and profiled using Omniperf:
```shell-session
$ omniperf profile -n stack --no-roof -- ./stack
<...>
$ omniperf analyze -p workloads/stack/mi200/ -b 10.3 16.3.11 -n per_kernel
<...>
10. Compute Units - Instruction Mix
10.3 VMEM Instr Mix
╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕
│ Index │ Metric │ Avg │ Min │ Max │ Unit │
╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡
│ 10.3.0 │ Global/Generic Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │
├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤
│ 10.3.1 │ Global/Generic Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │
├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤
│ 10.3.2 │ Global/Generic Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │
├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤
│ 10.3.3 │ Global/Generic Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │
├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤
│ 10.3.4 │ Spill/Stack Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │
├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤
│ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │
├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤
│ 10.3.6 │ Spill/Stack Write │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │
├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤
│ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │
╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛


--------------------------------------------------------------------------------
16. 
Vector L1 Data Cache
16.3 L1D Cache Accesses
╒═════════╤═════════════╤═══════╤═══════╤═══════╤════════════════╕
│ Index │ Metric │ Avg │ Min │ Max │ Unit │
╞═════════╪═════════════╪═══════╪═══════╪═══════╪════════════════╡
│ 16.3.11 │ L1-L2 Write │ 1.00 │ 1.00 │ 1.00 │ Req per kernel │
╘═════════╧═════════════╧═══════╧═══════╧═══════╧════════════════╛
```

Here we see a single write to the stack (10.3.6), which corresponds to an L1-L2 write request (16.3.11), i.e., the stack is backed by global memory and travels through the same memory hierarchy.

(IPC_example)=
## Instructions-per-cycle and Utilizations example

For this section, we use the instructions-per-cycle (IPC) [example](https://github.com/AMDResearch/omniperf/blob/dev/sample/ipc.hip) included with Omniperf.

This example is compiled using `c++17` support:

```shell-session
$ hipcc -O3 ipc.hip -o ipc -std=c++17
```

and was run on an MI250 CDNA2 accelerator:

```shell-session
$ omniperf profile -n ipc --no-roof -- ./ipc
```

The results shown in this section are _generally_ applicable to CDNA accelerators, but may vary between generations and specific products.

### Design note

The kernels in this example all execute a specific assembly operation `N` times (1000, by default); for instance, the `vmov` kernel:

```c++
template <int N>
__device__ void vmov_op() {
  int dummy;
  if constexpr (N >= 1) {
    asm volatile("v_mov_b32 v0, v1\n" : : "{v31}"(dummy));
    vmov_op<N - 1>();
  }
}

template <int N>
__global__ void vmov() {
  vmov_op<N>();
}
```

The kernels are then launched twice, once for a warm-up run, and once for measurement.

(VALU_ipc)=
### VALU Utilization and IPC

Now we can use our test to measure the achieved instructions-per-cycle of various types of instructions.
We start with a simple [VALU](valu) operation, i.e., a `v_mov_b32` instruction, e.g.:

```asm
v_mov_b32 v0, v1
```

This instruction simply copies the contents of the source register (`v1`) to the destination register (`v0`).
Investigating this kernel with Omniperf, we see:

```shell-session
$ omniperf analyze -p workloads/ipc/mi200/ --dispatch 7 -b 11.2
<...>
--------------------------------------------------------------------------------
0. Top Stat
╒════╤═══════════════════════════════╤═════════╤═════════════╤═════════════╤══════════════╤════════╕
│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │
╞════╪═══════════════════════════════╪═════════╪═════════════╪═════════════╪══════════════╪════════╡
│ 0 │ void vmov<1000>() [clone .kd] │ 1.00 │ 99317423.00 │ 99317423.00 │ 99317423.00 │ 100.00 │
╘════╧═══════════════════════════════╧═════════╧═════════════╧═════════════╧══════════════╧════════╛


--------------------------------------------------------------------------------
11. 
Compute Units - Compute Pipeline
11.2 Pipeline Stats
╒═════════╤═════════════════════╤═══════╤═══════╤═══════╤══════════════╕
│ Index │ Metric │ Avg │ Min │ Max │ Unit │
╞═════════╪═════════════════════╪═══════╪═══════╪═══════╪══════════════╡
│ 11.2.0 │ IPC │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.1 │ IPC (Issued) │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.2 │ SALU Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.3 │ VALU Util │ 99.98 │ 99.98 │ 99.98 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.4 │ VMEM Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.5 │ Branch Util │ 0.1 │ 0.1 │ 0.1 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.6 │ VALU Active Threads │ 64.0 │ 64.0 │ 64.0 │ Threads │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.7 │ MFMA Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.8 │ MFMA Instr Cycles │ │ │ │ Cycles/instr │
╘═════════╧═════════════════════╧═══════╧═══════╧═══════╧══════════════╛
```

Here we see that:

 1. Both the IPC (11.2.0) and "Issued" IPC (11.2.1) metrics are $\sim 1$,
 2. The VALU Utilization metric (11.2.3) is also $\sim100\%$, and finally
 3. The VALU Active Threads metric (11.2.6) is 64, i.e., the wavefront size on CDNA accelerators, as all threads in the wavefront are active.

We will explore the difference between the IPC (11.2.0) and "Issued" IPC (11.2.1) metrics in the [next section](Issued_ipc).

Additionally, we notice a small (0.1%) Branch utilization (11.2.5).
Inspecting the assembly of this kernel shows there are no branch operations; however, recalling the note in the [Pipeline statistics](Pipeline_stats) section:

> the Branch utilization <...> includes time spent in other instruction types (namely: `s_endpgm`) that are _typically_ a very small percentage of the overall kernel execution.

we see that this is coming from execution of the `s_endpgm` instruction at the end of every wavefront.

```{note}
Technically, the cycle counts used in the denominators of our IPC metrics are in units of quad-cycles, a group of 4 consecutive cycles.
However, a typical [VALU](valu) instruction on CDNA accelerators runs for a single quad-cycle (see [Layla Mah's GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah), slide 30).
Therefore, for simplicity, we report these metrics as "instructions per cycle".
```

(Issued_ipc)=
### Exploring "Issued" IPC via MFMA operations

```{warning}
The MFMA assembly operations used in this example are inherently unportable to older CDNA architectures.
```

Unlike the simple quad-cycle `v_mov_b32` operation discussed in our [previous example](VALU_ipc), some operations take many quad-cycles to execute. 
+For example, using the [AMD Matrix Instruction Calculator](https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator#example-of-querying-instruction-information) we can see that some [MFMA](mfma) operations take 64 cycles, e.g.: + +```shell-session +$ ./matrix_calculator.py --arch CDNA2 --detail-instruction --instruction v_mfma_f32_32x32x8bf16_1k +Architecture: CDNA2 +Instruction: V_MFMA_F32_32X32X8BF16_1K +<...> + Execution statistics: + FLOPs: 16384 + Execution cycles: 64 + FLOPs/CU/cycle: 1024 + Can co-execute with VALU: True + VALU co-execution cycles possible: 60 +``` + +What happens to our IPC when we utilize this `v_mfma_f32_32x32x8bf16_1k` instruction on a CDNA2 accelerator? +To find out, we turn to our `mfma` kernel in the IPC example: + +```shell-session +$ omniperf analyze -p workloads/ipc/mi200/ --dispatch 8 -b 11.2 --decimal 4 +<...> +-------------------------------------------------------------------------------- +0. Top Stat +╒════╤═══════════════════════════════╤═════════╤═════════════════╤═════════════════╤═════════════════╤══════════╕ +│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ +╞════╪═══════════════════════════════╪═════════╪═════════════════╪═════════════════╪═════════════════╪══════════╡ +│ 0 │ void mfma<1000>() [clone .kd] │ 1.0000 │ 1623167595.0000 │ 1623167595.0000 │ 1623167595.0000 │ 100.0000 │ +╘════╧═══════════════════════════════╧═════════╧═════════════════╧═════════════════╧═════════════════╧══════════╛ + + +-------------------------------------------------------------------------------- +11. Compute Units - Compute Pipeline +11.2 Pipeline Stats +╒═════════╤═════════════════════╤═════════╤═════════╤═════════╤══════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════════╪═════════╪═════════╪═════════╪══════════════╡ +│ 11.2.0 │ IPC │ 0.0626 │ 0.0626 │ 0.0626 │ Instr/cycle │ +├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ +│ 11.2.1 │ IPC (Issued) │ 1.0000 │ 1.0000 │ 1.0000 │ Instr/cycle │ +├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ +│ 11.2.2 │ SALU Util │ 0.0000 │ 0.0000 │ 0.0000 │ Pct │ +├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ +│ 11.2.3 │ VALU Util │ 6.2496 │ 6.2496 │ 6.2496 │ Pct │ +├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ +│ 11.2.4 │ VMEM Util │ 0.0000 │ 0.0000 │ 0.0000 │ Pct │ +├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ +│ 11.2.5 │ Branch Util │ 0.0062 │ 0.0062 │ 0.0062 │ Pct │ +├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ +│ 11.2.6 │ VALU Active Threads │ 64.0000 │ 64.0000 │ 64.0000 │ Threads │ +├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ +│ 11.2.7 │ MFMA Util │ 99.9939 │ 99.9939 │ 99.9939 │ Pct │ +├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ +│ 11.2.8 │ MFMA Instr Cycles │ 64.0000 │ 64.0000 │ 64.0000 │ Cycles/instr │ +╘═════════╧═════════════════════╧═════════╧═════════╧═════════╧══════════════╛ +``` + +In contrast to our [VALU IPC example](VALU_ipc), we now see that the IPC metric (11.2.0) and Issued IPC (11.2.1) metric differ substantially. +First, we see the VALU utilization (11.2.3) has decreased substantially, from nearly 100% to $\sim6.25\%$. 
We note that this matches the ratio of:

```math
((Execution\ cycles) - (VALU\ coexecution\ cycles)) / (Execution\ cycles)
```
reported by the matrix calculator, while the MFMA utilization (11.2.7) has increased to nearly 100%.

Recall: our `v_mfma_f32_32x32x8bf16_1k` instruction takes 64 cycles to execute, or 16 quad-cycles, matching our observed MFMA Instruction Cycles (11.2.8).
That is, we have a single instruction executed every 16 quad-cycles, or:

```math
1/16 = 0.0625
```

which is almost identical to our IPC metric (11.2.0).
Why, then, is the Issued IPC metric (11.2.1) equal to 1.0?

Instead of simply counting the number of instructions issued and dividing by the number of cycles the [CUs](CU) on the accelerator were active (as is done for 11.2.0), this metric counts the number of (non-[internal](Internal_ipc)) instructions issued and divides by the number of (quad-) cycles where the [scheduler](scheduler) was actively working on issuing instructions.
Thus the Issued IPC metric (11.2.1) answers more of a "for what percent of the total number of [scheduler](scheduler) cycles did a wave schedule an instruction?" question, while the IPC metric (11.2.0) is the ratio of the number of instructions executed to the total [active CU cycles](TotalActiveCUCycles).

```{warning}
There are further subtleties of the Issued IPC metric (11.2.1) that complicate its use.
We will explore these in the [subsequent section](Internal_ipc).
For these reasons, Omniperf typically promotes use of the regular IPC metric (11.2.0), e.g., in the top-level Speed-of-Light chart.
```

(Internal_ipc)=
### "Internal" instructions and IPC

Next, we explore the concept of an "internal" instruction.
From [Layla Mah's GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah) (slide 29), we see a few candidates for internal instructions, and we choose an `s_nop` instruction, which according to the [CDNA2 ISA Guide](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf):

>Does nothing; it can be repeated in hardware up to eight times.

Here we choose to use a no-op of:

```asm
s_nop 0x0
```

to make our point. Running this kernel through Omniperf yields:

```shell-session
$ omniperf analyze -p workloads/ipc/mi200/ --dispatch 9 -b 11.2
<...>
--------------------------------------------------------------------------------
0. Top Stat
╒════╤═══════════════════════════════╤═════════╤═════════════╤═════════════╤══════════════╤════════╕
│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │
╞════╪═══════════════════════════════╪═════════╪═════════════╪═════════════╪══════════════╪════════╡
│ 0 │ void snop<1000>() [clone .kd] │ 1.00 │ 14221851.50 │ 14221851.50 │ 14221851.50 │ 100.00 │
╘════╧═══════════════════════════════╧═════════╧═════════════╧═════════════╧══════════════╧════════╛


--------------------------------------------------------------------------------
11. 
Compute Units - Compute Pipeline
11.2 Pipeline Stats
╒═════════╤═════════════════════╤═══════╤═══════╤═══════╤══════════════╕
│ Index │ Metric │ Avg │ Min │ Max │ Unit │
╞═════════╪═════════════════════╪═══════╪═══════╪═══════╪══════════════╡
│ 11.2.0 │ IPC │ 6.79 │ 6.79 │ 6.79 │ Instr/cycle │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.1 │ IPC (Issued) │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.2 │ SALU Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.3 │ VALU Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.4 │ VMEM Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.5 │ Branch Util │ 0.68 │ 0.68 │ 0.68 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.6 │ VALU Active Threads │ │ │ │ Threads │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.7 │ MFMA Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.8 │ MFMA Instr Cycles │ │ │ │ Cycles/instr │
╘═════════╧═════════════════════╧═══════╧═══════╧═══════╧══════════════╛
```

First, we see that the IPC metric (11.2.0) exceeds our theoretical maximum of 5 instructions per cycle (discussed in the [scheduler](scheduler) section).
How can this be?

Recall that Layla's slides say "no functional unit" for the internal instructions.
This removes the usual limitation on the IPC: if we are _only_ issuing internal instructions, we are not issuing to any execution units!
However, workloads such as this one are almost _entirely_ artificial (i.e., they issue internal instructions almost exclusively); in practice, a maximum IPC of 5 is expected in almost all cases.

Secondly, we note that our "Issued" IPC (11.2.1) is still identical to one here.
Again, this has to do with the details of "internal" instructions.
Recall that in our [previous example](Issued_ipc) we defined this metric as explicitly excluding internal instruction counts.
The logical question then is, 'what _is_ this metric counting in our `s_nop` kernel?'

The generated assembly looks something like:

```asm
;;#ASMSTART
s_nop 0x0
;;#ASMEND
;;#ASMSTART
s_nop 0x0
;;#ASMEND
;;<... omitting many more ...>
s_endpgm
.section .rodata,#alloc
.p2align 6, 0x0
.amdhsa_kernel _Z4snopILi1000EEvv
```

Of particular interest here is the `s_endpgm` instruction, of which the [CDNA2 ISA guide](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf) states:

>End of program; terminate wavefront.

This is not on our list of internal instructions from Layla's tutorial, and is therefore counted as part of our Issued IPC (11.2.1).
Thus, the Issued IPC being equal to one here indicates that an `s_endpgm` instruction was issued on every (quad-) cycle where the [scheduler](scheduler) was issuing non-internal instructions, which is expected as this was our _only_ non-internal instruction!

(SALU_ipc)=
### SALU Utilization

Next, we explore a simple [SALU](salu) kernel in our ongoing IPC and utilization example. 
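
The `smov` kernel used for this test is not reproduced in the text; judging from the kernel name reported below, it presumably mirrors the recursive-template pattern of the `vmov` kernel from the design note, with the vector move swapped for a scalar move. A minimal sketch under that assumption (the `smov_op` helper name and the SGPR operand constraint are our own guesses, not the sample's exact source):

```c++
// Sketch only -- assumes the same recursive-template structure as vmov_op.
template <int N>
__device__ void smov_op() {
  int dummy;
  if constexpr (N >= 1) {
    // Scalar move; the "{s31}" constraint ties `dummy` to a scalar register,
    // mirroring the "{v31}" trick used in the vmov kernel.
    asm volatile("s_mov_b32 s0, s1\n" : : "{s31}"(dummy));
    smov_op<N - 1>();
  }
}

template <int N>
__global__ void smov() {
  smov_op<N>();
}
```
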
For this case, we select a simple scalar move operation, e.g.:

```asm
s_mov_b32 s0, s1
```

which, analogous to our [`v_mov`](VALU_ipc) example, copies the contents of the source scalar register (`s1`) to the destination scalar register (`s0`).
Running this kernel through Omniperf yields:

```shell-session
$ omniperf analyze -p workloads/ipc/mi200/ --dispatch 10 -b 11.2
<...>
--------------------------------------------------------------------------------
0. Top Stat
╒════╤═══════════════════════════════╤═════════╤═════════════╤═════════════╤══════════════╤════════╕
│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │
╞════╪═══════════════════════════════╪═════════╪═════════════╪═════════════╪══════════════╪════════╡
│ 0 │ void smov<1000>() [clone .kd] │ 1.00 │ 96246554.00 │ 96246554.00 │ 96246554.00 │ 100.00 │
╘════╧═══════════════════════════════╧═════════╧═════════════╧═════════════╧══════════════╧════════╛


--------------------------------------------------------------------------------
11. Compute Units - Compute Pipeline
11.2 Pipeline Stats
╒═════════╤═════════════════════╤═══════╤═══════╤═══════╤══════════════╕
│ Index │ Metric │ Avg │ Min │ Max │ Unit │
╞═════════╪═════════════════════╪═══════╪═══════╪═══════╪══════════════╡
│ 11.2.0 │ IPC │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.1 │ IPC (Issued) │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.2 │ SALU Util │ 99.98 │ 99.98 │ 99.98 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.3 │ VALU Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.4 │ VMEM Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.5 │ Branch Util │ 0.1 │ 0.1 │ 0.1 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.6 │ VALU Active Threads │ │ │ │ Threads │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.7 │ MFMA Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.8 │ MFMA Instr Cycles │ │ │ │ Cycles/instr │
╘═════════╧═════════════════════╧═══════╧═══════╧═══════╧══════════════╛
```

Here we see that:
 - both our IPC (11.2.0) and Issued IPC (11.2.1) are $\sim1.0$ as expected, and
 - the SALU Utilization (11.2.2) was nearly 100%, as it was active for almost the entire kernel.

(VALU_Active_Threads)=
### VALU Active Threads

For our final IPC/Utilization example, we consider a slight modification of our [`v_mov`](VALU_ipc) example:

```c++
template <int N>
__global__ void vmov_with_divergence() {
  if (threadIdx.x % 64 == 0)
    vmov_op<N>();
}
```

That is, we wrap our [VALU](valu) operation inside a conditional where only one lane in our wavefront is active.
Running this kernel through Omniperf yields:

```shell-session
$ omniperf analyze -p workloads/ipc/mi200/ --dispatch 11 -b 11.2
<...>
--------------------------------------------------------------------------------
0. 
Top Stat +╒════╤══════════════════════════════════════════╤═════════╤═════════════╤═════════════╤══════════════╤════════╕ +│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ +╞════╪══════════════════════════════════════════╪═════════╪═════════════╪═════════════╪══════════════╪════════╡ +│ 0 │ void vmov_with_divergence<1000>() [clone │ 1.00 │ 97125097.00 │ 97125097.00 │ 97125097.00 │ 100.00 │ +│ │ .kd] │ │ │ │ │ │ +╘════╧══════════════════════════════════════════╧═════════╧═════════════╧═════════════╧══════════════╧════════╛ + + +-------------------------------------------------------------------------------- +11. Compute Units - Compute Pipeline +11.2 Pipeline Stats +╒═════════╤═════════════════════╤═══════╤═══════╤═══════╤══════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════════╪═══════╪═══════╪═══════╪══════════════╡ +│ 11.2.0 │ IPC │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │ +├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ +│ 11.2.1 │ IPC (Issued) │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │ +├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ +│ 11.2.2 │ SALU Util │ 0.1 │ 0.1 │ 0.1 │ Pct │ +├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ +│ 11.2.3 │ VALU Util │ 99.98 │ 99.98 │ 99.98 │ Pct │ +├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ +│ 11.2.4 │ VMEM Util │ 0.0 │ 0.0 │ 0.0 │ Pct │ +├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ +│ 11.2.5 │ Branch Util │ 0.2 │ 0.2 │ 0.2 │ Pct │ +├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ +│ 11.2.6 │ VALU Active Threads │ 1.13 │ 1.13 │ 1.13 │ Threads │ +├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ +│ 11.2.7 │ MFMA Util │ 0.0 │ 0.0 │ 0.0 │ Pct │ +├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ +│ 11.2.8 │ MFMA Instr Cycles │ │ │ │ Cycles/instr │ +╘═════════╧═════════════════════╧═══════╧═══════╧═══════╧══════════════╛ +``` + +Here we see that once again, our VALU Utilization (11.2.3) is nearly 100%. +However, we note that the VALU Active Threads metric (11.2.6) is $\sim 1$, which matches our conditional in the source code. +So VALU Active Threads reports the average number of lanes of our wavefront that are active over all [VALU](valu) instructions, or thread "convergence" (i.e., 1 - [divergence](Divergence)). + +```{note} +We note here that: + +1. The act of evaluating a vector conditional in this example typically triggers VALU operations, contributing to why the VALU Active Threads metric is not identically one. +2. This metric is a time (cycle) averaged value, and thus contains an implicit dependence on the duration of various VALU instructions. + +Nonetheless, this metric serves as a useful measure of thread-convergence. +``` + +Finally, we note that our branch utilization (11.2.5) has increased slightly from our baseline, as we now have a branch (checking the value of `threadIdx.x`). + +## LDS Examples + +For this example, we consider the [LDS sample](https://github.com/AMDResearch/omniperf/blob/dev/sample/lds.hip) distributed as a part of Omniperf. +This code contains two kernels to explore how both [LDS](lds) bandwidth and bank conflicts are calculated in Omniperf. + +This example was compiled and run on an MI250 accelerator using ROCm v5.6.0, and Omniperf v2.0.0. 
+```shell-session +$ hipcc -O3 lds.hip -o lds +``` + +Finally, we generate our omniperf profile as: +```shell-session +$ omniperf profile -n lds --no-roof -- ./lds +``` + +(lds_bandwidth)= +### LDS Bandwidth + +To explore our 'theoretical LDS bandwidth' metric, we use a simple kernel: + +```c++ +constexpr unsigned max_threads = 256; +__global__ void load(int* out, int flag) { + __shared__ int array[max_threads]; + int index = threadIdx.x; + // fake a store to the LDS array to avoid unwanted behavior + if (flag) + array[max_threads - index] = index; + __syncthreads(); + int x = array[index]; + if (x == int(-1234567)) + out[threadIdx.x] = x; +} +``` + +Here we: + - Create an array of 256 integers in [LDS](lds) + - Fake a write to the LDS using the `flag` variable (always set to zero on the host) to avoid dead-code elimination + - Read a single integer per work-item from `threadIdx.x` of the LDS array + - If the integer is equal to a magic number (always false), write the value out to global memory to again, avoid dead-code elimination + +Finally, we launch this kernel repeatedly, varying the number of threads in our workgroup: + +```c++ +void bandwidth_demo(int N) { + for (int i = 1; i <= N; ++i) + load<<<1,i>>>(nullptr, 0); + hipDeviceSynchronize(); +} +``` + +Next, let's analyze the first of our bandwidth kernel dispatches: + +```shell-session +$ omniperf analyze -p workloads/lds/mi200/ -b 12.2.1 --dispatch 0 -n per_kernel +<...> +12. Local Data Share (LDS) +12.2 LDS Stats +╒═════════╤═══════════════════════╤════════╤════════╤════════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪════════╪════════╪════════╪══════════════════╡ +│ 12.2.1 │ Theoretical Bandwidth │ 256.00 │ 256.00 │ 256.00 │ Bytes per kernel │ +╘═════════╧═══════════════════════╧════════╧════════╧════════╧══════════════════╛ +``` + +Here we see that our Theoretical Bandwidth metric (12.2.1) is reporting 256 Bytes were loaded even though we launched a single work-item workgroup, and thus only loaded a single integer from LDS. Why is this? + +Recall our definition of this metric: + +> Indicates the maximum amount of bytes that could have been loaded from/stored to/atomically updated in the LDS per [normalization-unit](normunit). + +Here we see that this instruction _could_ have loaded up to 256 bytes of data (4 bytes for each work-item in the wavefront), and therefore this is the expected value for this metric in Omniperf, hence why this metric is named the "theoretical" bandwidth. + +To further illustrate this point we plot the relationship of the theoretical bandwidth metric (12.2.1) as compared to the effective (or achieved) bandwidth of this kernel, varying the number of work-items launched from 1 to 256: + +```{figure} images/ldsbandwidth.* +:scale: 50 % +:alt: Comparison of effective bandwidth versus the theoretical bandwidth metric in Omniperf for our simple example. +:align: center + +Comparison of effective bandwidth versus the theoretical bandwidth metric in Omniperf for our simple example. +``` + +Here we see that the theoretical bandwidth metric follows a step-function. It increases only when another wavefront issues an LDS instruction for up to 256 bytes of data. Such increases are marked in the plot using dashed lines. +In contrast, the effective bandwidth increases linearly, by 4 bytes, with the number of work-items in the kernel, N. 
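
To make the step-function behavior concrete, both curves in the figure can be modeled with a few lines of host-side arithmetic. The sketch below is only a model of the two quantities described above (it assumes a 64-work-item wavefront and one 4-byte load per work-item, as in this example) and is not part of the Omniperf sample itself:

```c++
#include <cstdio>

int main() {
  constexpr int wave_size = 64;      // wavefront size on CDNA accelerators
  constexpr int bytes_per_item = 4;  // each work-item loads one 4B integer
  for (int n = 1; n <= 256; ++n) {
    // Effective bandwidth grows linearly: every work-item loads 4 bytes.
    int effective = n * bytes_per_item;
    // The theoretical metric counts 256 bytes per LDS instruction, and one
    // LDS instruction is issued per wavefront, so it steps every 64 items.
    int wavefronts = (n + wave_size - 1) / wave_size;
    int theoretical = wavefronts * wave_size * bytes_per_item;
    std::printf("%3d work-items: effective %4d B, theoretical %4d B\n", n,
                effective, theoretical);
  }
  return 0;
}
```

Each additional wavefront adds one more LDS instruction, and hence another 256 bytes to the theoretical count, while the effective count grows by only 4 bytes per work-item.
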
+ +(lds_bank_conflicts)= +### Bank Conflicts + +Next we explore bank conflicts using a slight modification of our bandwidth kernel: + +```c++ +constexpr unsigned nbanks = 32; +__global__ void conflicts(int* out, int flag) { + constexpr unsigned nelements = nbanks * max_threads; + __shared__ int array[nelements]; + // each thread reads from the same bank + int index = threadIdx.x * nbanks; + // fake a store to the LDS array to avoid unwanted behavior + if (flag) + array[max_threads - index] = index; + __syncthreads(); + int x = array[index]; + if (x == int(-1234567)) + out[threadIdx.x] = x; +} +``` + +Here we: + - Allocate an [LDS](lds) array of size $32*256*4{B}=32{KiB}$ + - Fake a write to the LDS using the `flag` variable (always set to zero on the host) to avoid dead-code elimination + - Read a single integer per work-item from index `threadIdx.x * nbanks` of the LDS array + - If the integer is equal to a magic number (always false), write the value out to global memory to, again, avoid dead-code elimination. + +On the host, we again repeatedly launch this kernel, varying the number of work-items: + +```c++ +void conflicts_demo(int N) { + for (int i = 1; i <= N; ++i) + conflicts<<<1,i>>>(nullptr, 0); + hipDeviceSynchronize(); +} +``` + +Analyzing our first `conflicts` kernel (i.e., a single work-item), we see: + +```shell-session +$ omniperf analyze -p workloads/lds/mi200/ -b 12.2.4 12.2.6 --dispatch 256 -n per_kernel +<...> +-------------------------------------------------------------------------------- +12. Local Data Share (LDS) +12.2 LDS Stats +╒═════════╤════════════════╤═══════╤═══════╤═══════╤═══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════════╪═══════╪═══════╪═══════╪═══════════════════╡ +│ 12.2.4 │ Index Accesses │ 2.00 │ 2.00 │ 2.00 │ Cycles per kernel │ +├─────────┼────────────────┼───────┼───────┼───────┼───────────────────┤ +│ 12.2.6 │ Bank Conflict │ 0.00 │ 0.00 │ 0.00 │ Cycles per kernel │ +╘═════════╧════════════════╧═══════╧═══════╧═══════╧═══════════════════╛ +``` + +In our [previous example](lds_bank_conflicts), we showed how a load from a single work-item is considered to have a theoretical bandwidth of 256B. +Recall, the [LDS](lds) can load up to $128B$ per cycle (i.e, 32 banks x 4B / bank / cycle). +Hence, we see that loading an 4B integer spends two cycles accessing the LDS ($2\ {cycle} = (256B) / (128\ B/{cycle})$). + +Looking at the next `conflicts` dispatch (i.e., two work-items) yields: + +```shell-session +$ omniperf analyze -p workloads/lds/mi200/ -b 12.2.4 12.2.6 --dispatch 257 -n per_kernel +<...> +-------------------------------------------------------------------------------- +12. Local Data Share (LDS) +12.2 LDS Stats +╒═════════╤════════════════╤═══════╤═══════╤═══════╤═══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════════╪═══════╪═══════╪═══════╪═══════════════════╡ +│ 12.2.4 │ Index Accesses │ 3.00 │ 3.00 │ 3.00 │ Cycles per kernel │ +├─────────┼────────────────┼───────┼───────┼───────┼───────────────────┤ +│ 12.2.6 │ Bank Conflict │ 1.00 │ 1.00 │ 1.00 │ Cycles per kernel │ +╘═════════╧════════════════╧═══════╧═══════╧═══════╧═══════════════════╛ +``` + +Here we see a bank conflict! What happened? + +Recall that the index for each thread was calculated as: + +```c++ +int index = threadIdx.x * nbanks; +``` + +Or, precisely 32 elements, and each element is 4B wide (for a standard integer). 
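
Assuming the usual LDS addressing scheme in which consecutive 4-byte words map to consecutive banks (consistent with the 32 banks x 4B/bank figure quoted above), the bank touched by work-item $i$ is:

```math
bank(i) = (index_i) \bmod nbanks = (i \cdot 32) \bmod 32 = 0
```

so every active work-item reads from bank 0.
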
+That is, each thread strides back to the same bank in the LDS, such that each work-item we add to the dispatch results in another bank conflict! + +Recalling our discussion of bank conflicts in our [LDS](lds) description: + +>A bank conflict occurs when two (or more) work-items in a wavefront want to read, write, or atomically update different addresses that map to the same bank in the same cycle. +In this case, the conflict detection hardware will determined a new schedule such that the **access is split into multiple cycles with no conflicts in any single cycle.** + +Here we see the conflict resolution hardware in action! Because we have engineered our kernel to generate conflicts, we expect our bank conflict metric to scale linearly with the number of work-items: + +```{figure} images/ldsconflicts.* +:scale: 50 % +:alt: Comparison of LDS conflict cycles versus access cycles for our simple example. +:align: center + +Comparison of LDS conflict cycles versus access cycles for our simple example. +``` + +Here we show the comparison of the Index Accesses (12.2.4), to the Bank Conflicts (12.2.6) for the first 20 kernel invocations. +We see that each grows linearly, and there is a constant gap of 2 cycles between them (i.e., the first access is never considered a conflict). + + +Finally, we can use these two metrics to derive the Bank Conflict Rate (12.1.4). Since within an Index Access we have 32 banks that may need to be updated, we use: + +$$ +Bank\ Conflict\ Rate = 100 * ((Bank\ Conflicts / 32) / (Index\ Accesses - Bank\ Conflicts)) +$$ + +Plotting this, we see: + +```{figure} images/ldsconflictrate.* +:scale: 50 % +:alt: LDS Bank Conflict rate for our simple example. +:align: center + +LDS Bank Conflict rate for our simple example. +``` + +The bank conflict rate linearly increases with the number of work-items within a wavefront that are active, _approaching_ 100\%, but never quite reaching it. + + +(Occupancy_example)= +## Occupancy Limiters Example + + +In this [example](https://github.com/AMDResearch/omniperf/blob/dev/sample/occupancy.hip), we will investigate the use of the resource allocation panel in the [Workgroup Manager](SPI)'s metrics section to determine occupancy limiters. +This code contains several kernels to explore how both various kernel resources impact achieved occupancy, and how this is reported in Omniperf. + +This example was compiled and run on a MI250 accelerator using ROCm v5.6.0, and Omniperf v2.0.0: +```shell-session +$ hipcc -O3 occupancy.hip -o occupancy --save-temps +``` +We have again included the `--save-temps` flag to get the corresponding assembly. + +Finally, we generate our Omniperf profile as: +```shell-session +$ omniperf profile -n occupancy --no-roof -- ./occupancy +``` + +(Occupancy_experiment_design)= +### Design note + +For our occupancy test, we need to create a kernel that is resource heavy, in various ways. 

For this purpose, we use the following (somewhat funny-looking) kernel:

```c++
constexpr int bound = 16;
__launch_bounds__(256)
__global__ void vgprbound(int N, double* ptr) {
  double intermediates[bound];
  for (int i = 0 ; i < bound; ++i) intermediates[i] = N * threadIdx.x;
  double x = ptr[threadIdx.x];
  for (int i = 0; i < 100; ++i) {
    x += sin(pow(__shfl(x, i % warpSize) * intermediates[(i - 1) % bound], intermediates[i % bound]));
    intermediates[i % bound] = x;
  }
  if (x == N) ptr[threadIdx.x] = x;
}
```

Here we try to use as many [VGPRs](valu) as possible; to this end, we:
 - Create a small array of double-precision floats, sized to try to fit into registers (i.e., `bound`; this may need to be tuned depending on the ROCm version).
 - Specify `__launch_bounds__(256)` to increase the number of VGPRs available to the kernel (by limiting the number of wavefronts that can be resident on a [CU](CU)).
 - Write a unique, non-compile-time constant to each element of the array.
 - Repeatedly permute and call relatively expensive math functions on our array elements.
 - Keep the compiler from optimizing out any operations by faking a write to `ptr` based on a run-time conditional.

This yields a total of 122 VGPRs, although this number is expected to depend on the exact ROCm/compiler version.

```asm
  .size _Z9vgprboundiPd, .Lfunc_end1-_Z9vgprboundiPd
  ; -- End function
  .section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 4732
; NumSgprs: 68
; NumVgprs: 122
; NumAgprs: 0
; <...>
; AccumOffset: 124
```

We will use various permutations of this kernel to limit occupancy and, more importantly for the purposes of this example, demonstrate how this is reported in Omniperf.

(VGPR_occupancy)=
### VGPR Limited

For our first test, we use the `vgprbound` kernel discussed in the [design note](Occupancy_experiment_design).
After profiling, we run the analyze step on this kernel:

```shell-session
$ omniperf analyze -p workloads/occupancy/mi200/ -b 2.1.15 6.2 7.1.5 7.1.6 7.1.7 --dispatch 1
<...>
--------------------------------------------------------------------------------
0. Top Stat
╒════╤═════════════════════════╤═════════╤══════════════╤══════════════╤══════════════╤════════╕
│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │
╞════╪═════════════════════════╪═════════╪══════════════╪══════════════╪══════════════╪════════╡
│ 0 │ vgprbound(int, double*) │ 1.00 │ 923093822.50 │ 923093822.50 │ 923093822.50 │ 100.00 │
╘════╧═════════════════════════╧═════════╧══════════════╧══════════════╧══════════════╧════════╛


--------------------------------------------------------------------------------
2. System Speed-of-Light
2.1 Speed-of-Light
╒═════════╤═════════════════════╤═════════╤════════════╤═════════╤═══════════════╕
│ Index │ Metric │ Avg │ Unit │ Peak │ Pct of Peak │
╞═════════╪═════════════════════╪═════════╪════════════╪═════════╪═══════════════╡
│ 2.1.15 │ Wavefront Occupancy │ 1661.24 │ Wavefronts │ 3328.00 │ 49.92 │
╘═════════╧═════════════════════╧═════════╧════════════╧═════════╧═══════════════╛


--------------------------------------------------------------------------------
6. 
Workgroup Manager (SPI) +6.2 Workgroup Manager - Resource Allocation +╒═════════╤════════════════════════════════════════╤═══════╤═══════╤═══════╤════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════════════════════════════════╪═══════╪═══════╪═══════╪════════╡ +│ 6.2.0 │ Not-scheduled Rate (Workgroup Manager) │ 0.64 │ 0.64 │ 0.64 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.1 │ Not-scheduled Rate (Scheduler-Pipe) │ 24.94 │ 24.94 │ 24.94 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.2 │ Scheduler-Pipe Stall Rate │ 24.49 │ 24.49 │ 24.49 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.3 │ Scratch Stall Rate │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.4 │ Insufficient SIMD Waveslots │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.5 │ Insufficient SIMD VGPRs │ 94.90 │ 94.90 │ 94.90 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.6 │ Insufficient SIMD SGPRs │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.7 │ Insufficient CU LDS │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.8 │ Insufficient CU Barriers │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.9 │ Reached CU Workgroup Limit │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.10 │ Reached CU Wavefront Limit │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧════════════════════════════════════════╧═══════╧═══════╧═══════╧════════╛ + + +-------------------------------------------------------------------------------- +7. Wavefront +7.1 Wavefront Launch Stats +╒═════════╤══════════╤════════╤════════╤════════╤═══════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪══════════╪════════╪════════╪════════╪═══════════╡ +│ 7.1.5 │ VGPRs │ 124.00 │ 124.00 │ 124.00 │ Registers │ +├─────────┼──────────┼────────┼────────┼────────┼───────────┤ +│ 7.1.6 │ AGPRs │ 4.00 │ 4.00 │ 4.00 │ Registers │ +├─────────┼──────────┼────────┼────────┼────────┼───────────┤ +│ 7.1.7 │ SGPRs │ 80.00 │ 80.00 │ 80.00 │ Registers │ +╘═════════╧══════════╧════════╧════════╧════════╧═══════════╛ +``` + +Here we see that the kernel indeed does use _around_ (but not exactly) 122 VGPRs, with the difference due to granularity of VGPR allocations. +In addition, we see that we have allocated 4 "[AGPRs](agprs)". +We note that on current CDNA2 accelerators, the `AccumOffset` field of the assembly metadata: +```asm +; AccumOffset: 124 +``` +denotes the divide between `VGPRs` and `AGPRs`. + + +Next, we examine our wavefront occupancy (2.1.15), and see that we are reaching only $\sim50\%$ of peak occupancy. +As a result, we see that: + - We are not scheduling workgroups $\sim25\%$ of [total scheduler-pipe cycles](TotalPipeCycles) (6.2.1); recall from the discussion of the [Workgroup manager](SPI), 25\% is the maximum. + - The scheduler-pipe is stalled (6.2.2) from scheduling workgroups due to resource constraints for the same $\sim25\%$ of the time. 
+ - And finally, $\sim91\%$ of those stalls are due to a lack of SIMDs with the appropriate number of VGPRs available (6.2.5). + +That is, the reason we can't reach full occupancy is due to our VGPR usage, as expected! + +### LDS Limited + +To examine an LDS limited example, we must change our kernel slightly: + +```c++ +constexpr size_t fully_allocate_lds = 64ul * 1024ul / sizeof(double); +__launch_bounds__(256) +__global__ void ldsbound(int N, double* ptr) { + __shared__ double intermediates[fully_allocate_lds]; + for (int i = threadIdx.x ; i < fully_allocate_lds; i += blockDim.x) intermediates[i] = N * threadIdx.x; + __syncthreads(); + double x = ptr[threadIdx.x]; + for (int i = threadIdx.x; i < fully_allocate_lds; i += blockDim.x) { + x += sin(pow(__shfl(x, i % warpSize) * intermediates[(i - 1) % fully_allocate_lds], intermediates[i % fully_allocate_lds])); + __syncthreads(); + intermediates[i % fully_allocate_lds] = x; + } + if (x == N) ptr[threadIdx.x] = x; +} +``` + +where we now: + - allocate an 64 KiB LDS array per workgroup, and + - use our allocated LDS array instead of a register array + +Analyzing this: + +```shell-session +$ omniperf analyze -p workloads/occupancy/mi200/ -b 2.1.15 6.2 7.1.5 7.1.6 7.1.7 7.1.8 --dispatch 3 +<...> +-------------------------------------------------------------------------------- +2. System Speed-of-Light +2.1 Speed-of-Light +╒═════════╤═════════════════════╤════════╤════════════╤═════════╤═══════════════╕ +│ Index │ Metric │ Avg │ Unit │ Peak │ Pct of Peak │ +╞═════════╪═════════════════════╪════════╪════════════╪═════════╪═══════════════╡ +│ 2.1.15 │ Wavefront Occupancy │ 415.52 │ Wavefronts │ 3328.00 │ 12.49 │ +╘═════════╧═════════════════════╧════════╧════════════╧═════════╧═══════════════╛ + + +-------------------------------------------------------------------------------- +6. 
Workgroup Manager (SPI)
6.2 Workgroup Manager - Resource Allocation
╒═════════╤════════════════════════════════════════╤═══════╤═══════╤═══════╤════════╕
│ Index │ Metric │ Avg │ Min │ Max │ Unit │
╞═════════╪════════════════════════════════════════╪═══════╪═══════╪═══════╪════════╡
│ 6.2.0 │ Not-scheduled Rate (Workgroup Manager) │ 0.13 │ 0.13 │ 0.13 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.1 │ Not-scheduled Rate (Scheduler-Pipe) │ 24.87 │ 24.87 │ 24.87 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.2 │ Scheduler-Pipe Stall Rate │ 24.84 │ 24.84 │ 24.84 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.3 │ Scratch Stall Rate │ 0.00 │ 0.00 │ 0.00 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.4 │ Insufficient SIMD Waveslots │ 0.00 │ 0.00 │ 0.00 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.5 │ Insufficient SIMD VGPRs │ 0.00 │ 0.00 │ 0.00 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.6 │ Insufficient SIMD SGPRs │ 0.00 │ 0.00 │ 0.00 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.7 │ Insufficient CU LDS │ 96.47 │ 96.47 │ 96.47 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.8 │ Insufficient CU Barriers │ 0.00 │ 0.00 │ 0.00 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.9 │ Reached CU Workgroup Limit │ 0.00 │ 0.00 │ 0.00 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.10 │ Reached CU Wavefront Limit │ 0.00 │ 0.00 │ 0.00 │ Pct │
╘═════════╧════════════════════════════════════════╧═══════╧═══════╧═══════╧════════╛


--------------------------------------------------------------------------------
7. Wavefront
7.1 Wavefront Launch Stats
╒═════════╤════════════════╤══════════╤══════════╤══════════╤═══════════╕
│ Index │ Metric │ Avg │ Min │ Max │ Unit │
╞═════════╪════════════════╪══════════╪══════════╪══════════╪═══════════╡
│ 7.1.5 │ VGPRs │ 96.00 │ 96.00 │ 96.00 │ Registers │
├─────────┼────────────────┼──────────┼──────────┼──────────┼───────────┤
│ 7.1.6 │ AGPRs │ 0.00 │ 0.00 │ 0.00 │ Registers │
├─────────┼────────────────┼──────────┼──────────┼──────────┼───────────┤
│ 7.1.7 │ SGPRs │ 80.00 │ 80.00 │ 80.00 │ Registers │
├─────────┼────────────────┼──────────┼──────────┼──────────┼───────────┤
│ 7.1.8 │ LDS Allocation │ 65536.00 │ 65536.00 │ 65536.00 │ Bytes │
╘═════════╧════════════════╧══════════╧══════════╧══════════╧═══════════╛
```

We see that our VGPR allocation has gone down to 96 registers, but we now see a 64 KiB LDS allocation (7.1.8).
In addition, we see a similar not-scheduled rate (6.2.1) and stall rate (6.2.2) as in our [VGPR example](VGPR_occupancy); however, our occupancy limiter has now shifted from VGPRs (6.2.5) to LDS (6.2.7).

We note that although we see around the same scheduler/stall rates (with our LDS limiter), our wavefront occupancy (2.1.15) is significantly lower ($\sim12\%$)!
This is important to remember: the occupancy limiter metrics in the resource allocation section tell you what the limiter was, but _not_ how much the occupancy was limited. 
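
In fact, the measured number can be estimated by hand. Assuming 64 KiB of LDS per CU and a maximum of 32 wavefronts per CU on the [MI2XX](2xxnote) (3328 peak wavefronts over 104 CUs), a single workgroup of 256 work-items (4 wavefronts) that allocates the full 64 KiB is the only workgroup that can be resident on a CU, so:

```math
4\ wavefronts / 32\ wavefronts = 12.5\%
```

in good agreement with the measured wavefront occupancy of 12.49% (2.1.15).
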
+These metrics should always be analyzed in concert with the wavefront occupancy metric! + +### SGPR Limited + +Finally, we modify our kernel once more to make it limited by [SGPRs](salu): + +```c++ +constexpr int sgprlim = 1; +__launch_bounds__(1024, 8) +__global__ void sgprbound(int N, double* ptr) { + double intermediates[sgprlim]; + for (int i = 0 ; i < sgprlim; ++i) intermediates[i] = i; + double x = ptr[0]; + #pragma unroll 1 + for (int i = 0; i < 100; ++i) { + x += sin(pow(intermediates[(i - 1) % sgprlim], intermediates[i % sgprlim])); + intermediates[i % sgprlim] = x; + } + if (x == N) ptr[0] = x; +} +``` + +The major changes here are to: + - make as much as possible provably uniform across the wave (notice the lack of `threadIdx.x` in the `intermediates` initialization and elsewhere), + - addition of `__launch_bounds__(1024, 8)`, which reduces our maximum VGPRs to 64 (such that 8 waves can fit per SIMD), but causes some register spills (i.e., [Scratch](Mspace) usage), and + - lower the `bound` (here we use `sgprlim`) of the array to reduce VGPR/Scratch usage + +This results in the following assembly metadata for this kernel: +```asm + .size _Z9sgprboundiPd, .Lfunc_end3-_Z9sgprboundiPd + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 4872 +; NumSgprs: 76 +; NumVgprs: 64 +; NumAgprs: 0 +; TotalNumVgprs: 64 +; ScratchSize: 60 +; <...> +; AccumOffset: 64 +; Occupancy: 8 +``` + +Analyzing this workload yields: + +```shell-session +$ omniperf analyze -p workloads/occupancy/mi200/ -b 2.1.15 6.2 7.1.5 7.1.6 7.1.7 7.1.8 7.1.9 --dispatch 5 +<...> +-------------------------------------------------------------------------------- +0. Top Stat +╒════╤═════════════════════════╤═════════╤══════════════╤══════════════╤══════════════╤════════╕ +│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ +╞════╪═════════════════════════╪═════════╪══════════════╪══════════════╪══════════════╪════════╡ +│ 0 │ sgprbound(int, double*) │ 1.00 │ 782069812.00 │ 782069812.00 │ 782069812.00 │ 100.00 │ +╘════╧═════════════════════════╧═════════╧══════════════╧══════════════╧══════════════╧════════╛ + + +-------------------------------------------------------------------------------- +2. System Speed-of-Light +2.1 Speed-of-Light +╒═════════╤═════════════════════╤═════════╤════════════╤═════════╤═══════════════╕ +│ Index │ Metric │ Avg │ Unit │ Peak │ Pct of Peak │ +╞═════════╪═════════════════════╪═════════╪════════════╪═════════╪═══════════════╡ +│ 2.1.15 │ Wavefront Occupancy │ 3291.76 │ Wavefronts │ 3328.00 │ 98.91 │ +╘═════════╧═════════════════════╧═════════╧════════════╧═════════╧═══════════════╛ + + +-------------------------------------------------------------------------------- +6. 
+6.2 Workgroup Manager - Resource Allocation
+╒═════════╤════════════════════════════════════════╤═══════╤═══════╤═══════╤════════╕
+│ Index   │ Metric                                 │   Avg │   Min │   Max │ Unit   │
+╞═════════╪════════════════════════════════════════╪═══════╪═══════╪═══════╪════════╡
+│ 6.2.0   │ Not-scheduled Rate (Workgroup Manager) │  7.72 │  7.72 │  7.72 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.1   │ Not-scheduled Rate (Scheduler-Pipe)    │ 15.17 │ 15.17 │ 15.17 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.2   │ Scheduler-Pipe Stall Rate              │  7.38 │  7.38 │  7.38 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.3   │ Scratch Stall Rate                     │ 39.76 │ 39.76 │ 39.76 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.4   │ Insufficient SIMD Waveslots            │ 26.32 │ 26.32 │ 26.32 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.5   │ Insufficient SIMD VGPRs                │ 26.32 │ 26.32 │ 26.32 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.6   │ Insufficient SIMD SGPRs                │ 25.52 │ 25.52 │ 25.52 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.7   │ Insufficient CU LDS                    │  0.00 │  0.00 │  0.00 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.8   │ Insufficient CU Barriers               │  0.00 │  0.00 │  0.00 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.9   │ Reached CU Workgroup Limit             │  0.00 │  0.00 │  0.00 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.10  │ Reached CU Wavefront Limit             │  0.00 │  0.00 │  0.00 │ Pct    │
+╘═════════╧════════════════════════════════════════╧═══════╧═══════╧═══════╧════════╛
+
+
+--------------------------------------------------------------------------------
+7. Wavefront
+7.1 Wavefront Launch Stats
+╒═════════╤════════════════════╤═══════╤═══════╤═══════╤════════════════╕
+│ Index   │ Metric             │   Avg │   Min │   Max │ Unit           │
+╞═════════╪════════════════════╪═══════╪═══════╪═══════╪════════════════╡
+│ 7.1.5   │ VGPRs              │ 64.00 │ 64.00 │ 64.00 │ Registers      │
+├─────────┼────────────────────┼───────┼───────┼───────┼────────────────┤
+│ 7.1.6   │ AGPRs              │  0.00 │  0.00 │  0.00 │ Registers      │
+├─────────┼────────────────────┼───────┼───────┼───────┼────────────────┤
+│ 7.1.7   │ SGPRs              │ 80.00 │ 80.00 │ 80.00 │ Registers      │
+├─────────┼────────────────────┼───────┼───────┼───────┼────────────────┤
+│ 7.1.8   │ LDS Allocation     │  0.00 │  0.00 │  0.00 │ Bytes          │
+├─────────┼────────────────────┼───────┼───────┼───────┼────────────────┤
+│ 7.1.9   │ Scratch Allocation │ 60.00 │ 60.00 │ 60.00 │ Bytes/workitem │
+╘═════════╧════════════════════╧═══════╧═══════╧═══════╧════════════════╛
+```
+
+Here we see that our wavefront launch stats (7.1) have changed to reflect the metadata seen in the `--save-temps` output.
+Of particular interest, we see:
+ - The SGPR allocation (7.1.7) is 80 registers, slightly more than the 76 requested by the compiler due to allocation granularity, and
+ - We have a ['scratch'](Mspace), i.e., private memory, allocation of 60 bytes per work-item.
+
+Analyzing the resource allocation block (6.2), we now see that, for the first time, the 'Not-scheduled Rate (Workgroup Manager)' metric (6.2.0) has become non-zero.
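+
+Before looking at why, it can be useful to confirm where this scratch comes from directly in the compiler's resource summary shown above. A minimal sketch, assuming the kernel lives in a file named `occupancy.hip` and is built for an MI200 (gfx90a) accelerator; the name of the assembly file produced by `--save-temps` varies with the compiler version:
+
+```shell-session
+$ hipcc --offload-arch=gfx90a --save-temps -c occupancy.hip
+$ grep -E 'NumSgprs|ScratchSize|Occupancy' occupancy-hip-amdgcn-amd-amdhsa-gfx90a.s  # illustrative file name
+```
+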
This is because the workgroup manager is responsible for management of scratch, which we see also contributes to our occupancy limiters in the 'Scratch Stall Rate' (6.2.3). We note that the sum of the workgroup manager not-scheduled rate and the scheduler-pipe non-scheduled rate is still $\sim25\%$, as in our previous examples + +Next, we see that the scheduler-pipe stall rate (6.2.2), i.e., how often we could not schedule a workgroup to a CU was only about $\sim8\%$. +This hints that perhaps, our kernel is not _particularly_ occupancy limited by resources, and indeed checking the wave occupancy metric (2.1.15) shows that this kernel is reaching nearly 99% occupancy! + +Finally, we inspect the occupancy limiter metrics and see a roughly even split between [waveslots](valu) (6.2.4), [VGPRs](valu) (6.2.5), and [SGPRs](salu) (6.2.6) along with the scratch stalls (6.2.3) previously mentioned. + +This is yet another reminder to view occupancy holistically. +While these metrics tell you why a workgroup cannot be scheduled, they do _not_ tell you what your occupancy was (consult wavefront occupancy) _nor_ whether increasing occupancy will be beneficial to performance. + diff --git a/src/docs/profiling.md b/src/docs/profiling.md index 56c234604..14d212b2c 100644 --- a/src/docs/profiling.md +++ b/src/docs/profiling.md @@ -37,7 +37,7 @@ Releasing CPU memory ``` ## Omniperf Profiling -The *omniperf* script, availible through the [Omniperf](https://github.com/AMDResearch/omniperf) repository, is used to aquire all necessary perfmon data through analysis of compute workloads. +The *omniperf* script, available through the Omniperf repository, is used to aquire all necessary performance monitoring data through analysis of compute workloads. **omniperf help:** ```shell-session @@ -80,7 +80,7 @@ Profile Options: -p , --path Specify path to save workload. (DEFAULT: /home/colramos/GitHub/omniperf/workloads/) -k [ ...], --kernel [ ...] Kernel filtering. - -b [ ...], --ipblocks [ ...] IP block filtering: + -b [ ...], --ipblocks [ ...] Hardware block filtering: SQ SQC TA @@ -108,6 +108,13 @@ Standalone Roofline Options: --kernel-names Include kernel names in roofline plot. ``` +- The `-k` \ flag allows for kernel filtering, which is compatible with the current rocProf utility. + +- The `-d` \ flag allows for dispatch ID filtering, which is compatible with the current rocProf utility. + +- The `-b` \ allows system profiling on one or more selected hardware components to speed up the profiling process. One can gradually include more hardware components, without overwriting performance data acquired on other hardware components. + + The following sample command profiles the *vcopy* workload. **vcopy profiling:** @@ -116,7 +123,6 @@ $ omniperf profile --name vcopy -- ./vcopy 1048576 256 Resolving rocprof ROC Profiler: /usr/bin/rocprof - ------------- Profile only ------------- @@ -152,7 +158,7 @@ Finished executing kernel Finished copying the output vector from the GPU to the CPU Releasing GPU memory Releasing CPU memory - + ... ... 
ROCPRofiler: 1 contexts collected, output directory /tmp/rpl_data_220527_130317_1787038/input_results_220527_130317 File 'workloads/vcopy/mi200/timestamps.csv' is generating @@ -204,16 +210,16 @@ Peak MFMA FLOPs (F64), GPU ID: 1, workgroupSize:256, workgroups:16384, experimen 99% [||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ] Peak MFMA IOPs (I8), GPU ID: 1, workgroupSize:256, workgroups:16384, experiments:100, IOP:2147483648000, duration:14.3 ms, mean:150317.8 GOPS, stdev=203.5 GOPS ``` -You'll notice two stages in *default* Omniperf profiling. The first stage collects all the counters needed for Omniperf analysis (omitting any filters you've provided). The second stage collects data for the roofline analysis (this stage can be disabled using `--no-roof`) +You will notice two stages in *default* Omniperf profiling. The first stage collects all the counters needed for Omniperf analysis (omitting any filters you have provided). The second stage collects data for the roofline analysis (this stage can be disabled using `--no-roof`) -At the end of the profiling, all resulting csv files should be located in a SOC specific target directory, e.g.: - - "mi200" for the AMD Instinct (tm) MI-200 family of accelerators - - "mi100" for the AMD Instinct (tm) MI-100 family of accelerators -etc. The SOC names are generated as a part of Omniperf, and do not necessarily distinguish between different accelerators in the same family (e.g., an AMD Instinct (tm) MI-210 vs an MI-250) +In this document, we use the term System on Chip (SoC) to refer to a particular family of accelerators. At the end of profiling, all resulting csv files should be located in a SoC specific target directory, e.g.: + - "mi200" for the AMD Instinct (tm) MI200 family of accelerators + - "mi100" for the AMD Instinct (tm) MI100 family of accelerators +etc. The SoC names are generated as a part of Omniperf, and do not necessarily distinguish between different accelerators in the same family (e.g., an AMD Instinct (tm) MI210 vs an MI250) -> Note: Additionally, you'll notice a few extra files. An SoC parameters file, *sysinfo.csv*, is created to reflect the target device settings. All profiling output is stored in *log.txt*. Roofline specific benchmark results are stored in *roofline.csv*. +> Note: Additionally, you will notice a few extra files. An SoC parameters file, *sysinfo.csv*, is created to reflect the target device settings. All profiling output is stored in *log.txt*. Roofline specific benchmark results are stored in *roofline.csv*. -```shell +```shell-session $ ls workloads/vcopy/mi200/ total 112 drwxrwxr-x 3 colramos colramos 4096 Apr 11 16:42 . @@ -232,17 +238,17 @@ drwxrwxr-x 2 colramos colramos 4096 Apr 11 16:42 perfmon ``` ### Filtering -To reduce profiling time and the counters collected one may use profiling filters. Profiling filters and their functionality depend on the underlying profiler being used. While Omniperf is profiler agnostic, we've provided a detailed description of profiling filters available when using Omniperf with [rocProfiler](https://rocm.docs.amd.com/projects/rocprofiler/en/latest/rocprof.html) below. +To reduce profiling time and the counters collected one may use profiling filters. Profiling filters and their functionality depend on the underlying profiler being used. While Omniperf is profiler agnostic, we have provided a detailed description of profiling filters available when using Omniperf with [rocProf](https://rocm.docs.amd.com/projects/rocprofiler/en/latest/rocprof.html) below. 
Filtering Options: -- The `-k` \ flag allows for kernel filtering. Useage is equivalent with the current rocprof utility ([see details below](#kernel-filtering)). +- The `-k` \ flag allows for kernel filtering. Useage is equivalent with the current rocProf utility ([see details below](#kernel-filtering)). -- The `-d` \ flag allows for dispatch ID filtering. Useage is equivalent with the current rocprof utility ([see details below](#dispatch-filtering)). +- The `-d` \ flag allows for dispatch ID filtering. Useage is equivalent with the current rocProf utility ([see details below](#dispatch-filtering)). -- The `-b` \ allows system profiling on one or more selected IP blocks to speed up the profiling process. One can gradually incorporate more IP blocks, without overwriting performance data acquired on other IP blocks. +- The `-b` \ allows system profiling on one or more selected hardware components to speed up the profiling process. One can gradually include more hardware components, without overwriting performance data acquired on other hardware components. ```{note} Be cautious while combining different profiling filters in the same call. Conflicting filters may result in error. @@ -250,11 +256,11 @@ Be cautious while combining different profiling filters in the same call. Confli i.e. filtering dispatch X, but dispatch X does not match your kernel name filter ``` -#### IP Block Filtering -One can profile a selected IP Block to speed up the profiling process. All profiling results are accumulated in the same target directory, without overwriting those for other IP blocks, hence enabling the incremental profiling and analysis. +#### Hardware Component Filtering +One can profile specific hardware components to speed up the profiling process. In Omniperf, we use the term IP block to refer to a hardware component or a group of hardware components. All profiling results are accumulated in the same target directory, without overwriting those for other hardware components, hence enabling the incremental profiling and analysis. -The following example only gathers hardware counters for SQ and TCC, skipping all other IP Blocks: -```shell +The following example only gathers hardware counters for the Shader Sequencer (SQ) and L2 Cache (TCC) components, skipping all other hardware components: +```shell-session $ omniperf profile --name vcopy -b SQ TCC -- ./sample/vcopy 1048576 256 Resolving rocprof ROC Profiler: /usr/bin/rocprof @@ -291,15 +297,14 @@ Log: /home/colramos/GitHub/omniperf-pub/workloads/vcopy/mi200/log.txt ``` #### Kernel Filtering -Kernel filtering is based on the name of the kernel(s) you'd like to isolate. Use a kernel name substring list to isolate desired kernels. +Kernel filtering is based on the name of the kernel(s) you would like to isolate. Use a kernel name substring list to isolate desired kernels. The following example demonstrates profiling isolating the kernel matching substring "vecCopy": -```shell +```shell-session $ omniperf profile --name vcopy -k vecCopy -- ./vcopy 1048576 256 Resolving rocprof ROC Profiler: /usr/bin/rocprof - ------------- Profile only ------------- @@ -323,7 +328,7 @@ Finished allocating vectors on the CPU ROCProfiler: input from "/tmp/rpl_data_230411_170300_29696/input0.xml" gpu_index = kernel = vecCopy - + ... ... 
``` @@ -336,7 +341,6 @@ $ omniperf profile --name vcopy -d 0 -- ./vcopy 1048576 256 Resolving rocprof ROC Profiler: /usr/bin/rocprof - ------------- Profile only ------------- @@ -365,19 +369,18 @@ ROCProfiler: input from "/tmp/rpl_data_230411_170356_30314/input0.xml" ``` - ### Standalone Roofline -If you're only interested in generating roofline analysis data try using `--roof-only`. This will only collect counters relevent to roofline, as well as generate a standalone .pdf output of your roofline plot. +If you are only interested in generating roofline analysis data try using `--roof-only`. This will only collect counters relevant to roofline, as well as generate a standalone .pdf output of your roofline plot. Standalone Roofline Options: -- The `--sort` \ allows you to specify whether you'd like to overlay top kernel or top dispatch data in your roofline plot. +- The `--sort` \ allows you to specify whether you would like to overlay top kernel or top dispatch data in your roofline plot. -- The `-m` \ allows you to specify specific level(s) of cache you'd like to include in your roofline plot. +- The `-m` \ allows you to specify specific level(s) of cache you would like to include in your roofline plot. - The `--device` \ allows you to specify a device id to collect performace data from when running our roofline benchmark on your system. -- If you'd like to distinguish different kernels in your .pdf roofline plot use `--kernel-names`. This will give each kernel a unique marker identifiable from the plot's key. +- If you would like to distinguish different kernels in your .pdf roofline plot use `--kernel-names`. This will give each kernel a unique marker identifiable from the plot's key. #### Roofline Only @@ -422,4 +425,4 @@ drwxrwxr-x 2 colramos colramos 4096 Apr 11 17:16 perfmon ``` A sample *empirRoof_gpu-ALL_fp32.pdf* looks something like this: -![Sample Standalone Roof Plot](images/sample-roof-plot.png) \ No newline at end of file +![Sample Standalone Roof Plot](images/sample-roof-plot.png) diff --git a/src/omniperf_analyze/configs/gfx906/0200_system-speed-of-light.yaml b/src/omniperf_analyze/configs/gfx906/0200_system-speed-of-light.yaml index 986b2f0ae..82e59c997 100644 --- a/src/omniperf_analyze/configs/gfx906/0200_system-speed-of-light.yaml +++ b/src/omniperf_analyze/configs/gfx906/0200_system-speed-of-light.yaml @@ -14,10 +14,10 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit peak: Peak - pop: PoP + pop: Pct of Peak tips: Tips metric: VALU FLOPs: @@ -28,7 +28,7 @@ Panel Config: tips: VALU IOPs: value: None # No perf counter - unit: GOPs + unit: GIOPs peak: (((($sclk * $numCU) * 64) * 2) / 1000) pop: None # No perf counter tips: @@ -68,25 +68,37 @@ Panel Config: peak: $numCU pop: ((100 * $numActiveCUs) / $numCU) tips: - SALU Util: + SALU Utilization: value: AVG(((100 * SQ_ACTIVE_INST_SCA) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct peak: 100 pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / (GRBM_GUI_ACTIVE * $numCU))) tips: - VALU Util: + VALU Utilization: value: AVG(((100 * SQ_ACTIVE_INST_VALU) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct peak: 100 pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / (GRBM_GUI_ACTIVE * $numCU))) tips: - MFMA Util: + MFMA Utilization: value: None # No HW module unit: pct peak: 100 pop: None # No HW module tips: - VALU Active Threads/Wave: + VMEM Utilization: + value: None # No HW module + unit: pct + peak: 100 + pop: None # No HW module + tips: + Branch Utilization: + value: None # No HW module + unit: pct + peak: 100 + 
pop: None # No HW module + tips: + VALU Active Threads: value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) unit: Threads @@ -94,25 +106,29 @@ Panel Config: pop: (AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) * 1.5625) tips: - IPC - Issue: - value: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) - / SQ_ACTIVE_INST_ANY)) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) unit: Instr/cycle peak: 5 - pop: ((100 * AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) - / SQ_ACTIVE_INST_ANY))) / 5) + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) + tips: + Wavefront Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE)) + unit: Wavefronts + peak: ($maxWavesPerCU * $numCU) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE) / ($maxWavesPerCU + * $numCU)))) + coll_level: SQ_LEVEL_WAVES tips: - LDS BW: + Theoretical LDS Bandwidth: value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs))) - unit: GB/sec + unit: GB/s peak: (($sclk * $numCU) * 0.128) pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs)) / (($sclk * $numCU) * 0.00128))) tips: - LDS Bank Conflict: + LDS Bank Conflicts/Access: value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) unit: Conflicts/access @@ -120,35 +136,7 @@ Panel Config: pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) tips: - Instr Cache Hit Rate: - value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - tips: - Instr Cache BW: - value: AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64)) - unit: GB/s - peak: ((($sclk / 1000) * 64) * $numSQC) - pop: ((100 * AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk - / 1000) * 64) * $numSQC)) - tips: - Scalar L1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - tips: - Scalar L1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64)) - unit: GB/s - peak: ((($sclk / 1000) * 64) * $numSQC) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk - / 1000) * 64) * $numSQC)) - tips: - Vector L1D Cache Hit Rate: + vL1D Cache Hit Rate: value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else @@ -160,7 +148,7 @@ Panel Config: TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else None)) tips: - Vector L1D Cache BW: + vL1D Cache BW: value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) unit: GB/s peak: ((($sclk / 1000) * 64) * $numCU) @@ -175,6 +163,13 @@ Panel Config: pop: AVG((((100 * 
TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else None)) tips: + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 64) / (EndNs - BeginNs))) + unit: GB/s + peak: ((($sclk / 1000) * 64) * TO_INT($L2Banks)) + pop: ((100 * AVG(((TCC_REQ_sum * 64) / (EndNs - BeginNs)))) + / ((($sclk / 1000) * 64) * TO_INT($L2Banks))) + tips: L2-Fabric Read BW: value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / (EndNs - BeginNs))) @@ -195,36 +190,48 @@ Panel Config: value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None tips: L2-Fabric Write Latency: value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None tips: - Wave Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE)) - unit: Wavefronts - peak: ($maxWavesPerCU * $numCU) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE) / ($maxWavesPerCU - * $numCU)))) - coll_level: SQ_LEVEL_WAVES + sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) tips: - Instr Fetch BW: - value: AVG(((SQ_IFETCH / (EndNs - BeginNs)) * 32)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64)) unit: GB/s - peak: ((($sclk / 1000) * 32) * $numSQC) - pop: ((100 * AVG(((SQ_IFETCH / (EndNs - BeginNs)) * 32))) / ($numSQC - * (($sclk / 1000) * 32))) - coll_level: SQ_IFETCH_LEVEL + peak: ((($sclk / 1000) * 64) * $numSQC) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk + / 1000) * 64) * $numSQC)) + tips: + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + tips: + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 32)) + unit: GB/s + peak: ((($sclk / 1000) * 64) * $numSQC) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 32))) / ((($sclk + / 1000) * 32) * $numSQC)) tips: - Instr Fetch Latency: + L1I Fetch Latency: value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None coll_level: SQ_IFETCH_LEVEL tips: diff --git a/src/omniperf_analyze/configs/gfx906/0500_command-processor.yaml b/src/omniperf_analyze/configs/gfx906/0500_command-processor.yaml index 525091879..edd42da6e 100644 --- a/src/omniperf_analyze/configs/gfx906/0500_command-processor.yaml +++ b/src/omniperf_analyze/configs/gfx906/0500_command-processor.yaml @@ -19,19 +19,7 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy Cycles: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles/Kernel - tips: - CPF Busy: - avg: AVG(CPF_CPF_STAT_BUSY) - min: MIN(CPF_CPF_STAT_BUSY) - max: MAX(CPF_CPF_STAT_BUSY) - unit: Cycles/Kernel - tips: - CPF Util: + CPF Utilization: avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) @@ -47,15 +35,9 @@ Panel Config: != 0) else None)) max: MAX((((100 * CPF_CPF_STAT_STALL) / 
CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY != 0) else None)) - unit: Cycles/Kernel - tips: - L2Cache Intf Busy: - avg: AVG(CPF_CPF_TCIU_BUSY) - min: MIN(CPF_CPF_TCIU_BUSY) - max: MAX(CPF_CPF_TCIU_BUSY) - unit: Cycles/Kernel + unit: pct tips: - L2Cache Intf Util: + CPF-L2 Utilization: avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) @@ -64,7 +46,7 @@ Panel Config: if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) unit: pct tips: - L2Cache Intf Stall: + CPF-L2 Stall: avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY != 0) else None)) min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY @@ -73,11 +55,14 @@ Panel Config: != 0) else None)) unit: pct tips: - UTCL1 Stall: - avg: AVG(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - min: MIN(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - max: MAX(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - unit: Cycles/Kernel + CPF-UTCL1 Stall: + avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + unit: pct tips: - metric_table: @@ -91,19 +76,7 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy Cycles: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles - tips: - CPC Busy Cycles: - avg: AVG(CPC_CPC_STAT_BUSY) - min: MIN(CPC_CPC_STAT_BUSY) - max: MAX(CPC_CPC_STAT_BUSY) - unit: Cycles - tips: - CPC Util: + CPC Utilization: avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) @@ -112,12 +85,6 @@ Panel Config: if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) unit: pct tips: - CPC Stall Cycles: - avg: AVG(CPC_CPC_STAT_STALL) - min: MIN(CPC_CPC_STAT_STALL) - max: MAX(CPC_CPC_STAT_STALL) - unit: Cycles - tips: CPC Stall Rate: avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY != 0) else None)) @@ -127,28 +94,19 @@ Panel Config: != 0) else None)) unit: pct tips: - CPC Packet Decoding: - avg: AVG(CPC_ME1_BUSY_FOR_PACKET_DECODE) - min: MIN(CPC_ME1_BUSY_FOR_PACKET_DECODE) - max: MAX(CPC_ME1_BUSY_FOR_PACKET_DECODE) - unit: Cycles - tips: - SPI Intf Busy Cycles: - avg: AVG(CPC_ME1_DC0_SPI_BUSY) - min: MIN(CPC_ME1_DC0_SPI_BUSY) - max: MAX(CPC_ME1_DC0_SPI_BUSY) - unit: Cycles - tips: - SPI Intf Util: - avg: AVG((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) unit: pct tips: - L2Cache Intf Util: + 
CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + unit: Pct + tips: + CPC-L2 Utilization: avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) @@ -157,19 +115,16 @@ Panel Config: if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) unit: pct tips: - UTCL1 Stall Cycles: - avg: AVG(CPC_UTCL1_STALL_ON_TRANSLATION) - min: MIN(CPC_UTCL1_STALL_ON_TRANSLATION) - max: MAX(CPC_UTCL1_STALL_ON_TRANSLATION) - unit: Cycles - tips: - UTCL2 Intf Busy Cycles: - avg: AVG(CPC_CPC_UTCL2IU_BUSY) - min: MIN(CPC_CPC_UTCL2IU_BUSY) - max: MAX(CPC_CPC_UTCL2IU_BUSY) - unit: Cycles + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: pct tips: - UTCL2 Intf Util: + CPC-UTCL2 Utilization: avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) diff --git a/src/omniperf_analyze/configs/gfx906/0600_shader-processor-input.yaml b/src/omniperf_analyze/configs/gfx906/0600_shader-processor-input.yaml index bab48700a..24d4036ec 100644 --- a/src/omniperf_analyze/configs/gfx906/0600_shader-processor-input.yaml +++ b/src/omniperf_analyze/configs/gfx906/0600_shader-processor-input.yaml @@ -6,11 +6,11 @@ Metric Description: # Define the panel properties and properties of each metric in the panel. 
Panel Config: id: 600 - title: Shader Processor Input (SPI) + title: Workgroup Manager (SPI) data source: - metric_table: id: 601 - title: SPI Stats + title: Workgroup Manager Utilizations header: metric: Metric avg: Avg @@ -19,29 +19,35 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles - tips: - CS Busy: - avg: AVG(SPI_CSN_BUSY) - min: MIN(SPI_CSN_BUSY) - max: MAX(SPI_CSN_BUSY) - unit: Cycles - tips: - SPI Busy: - avg: AVG(GRBM_SPI_BUSY) - min: MIN(GRBM_SPI_BUSY) - max: MAX(GRBM_SPI_BUSY) - unit: Cycles - tips: - SQ Busy: - avg: AVG(SQ_BUSY_CYCLES) - min: MIN(SQ_BUSY_CYCLES) - max: MAX(SQ_BUSY_CYCLES) - unit: Cycles + Accelerator Utilization: + avg: AVG(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + min: MIN(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + max: MAX(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + unit: Pct + tips: + Scheduler-Pipe Utilization: + avg: AVG(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + min: MIN(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + max: MAX(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + unit: Pct + tips: + Workgroup Manager Utilization: + avg: AVG(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + min: MIN(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + max: MAX(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + unit: Pct + tips: + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + min: MIN(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + max: MAX(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + unit: Pct + tips: + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Dispatched Workgroups: avg: AVG(SPI_CSN_NUM_THREADGROUPS) @@ -55,22 +61,27 @@ Panel Config: max: MAX(SPI_CSN_WAVE) unit: Wavefronts tips: - Wave Alloc Failed: - avg: AVG(SPI_RA_REQ_NO_ALLOC) - min: MIN(SPI_RA_REQ_NO_ALLOC) - max: MAX(SPI_RA_REQ_NO_ALLOC) - unit: Cycles - tips: - Wave Alloc Failed - CS: - avg: AVG(SPI_RA_REQ_NO_ALLOC_CSN) - min: MIN(SPI_RA_REQ_NO_ALLOC_CSN) - max: MAX(SPI_RA_REQ_NO_ALLOC_CSN) - unit: Cycles + VGPR Writes: + avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave tips: - + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + tips: - metric_table: id: 602 - title: SPI Resource Allocation + title: Workgroup Manager - Resource Allocation header: metric: Metric avg: Avg @@ -79,96 +90,78 @@ Panel Config: unit: Unit tips: Tips metric: - Wave request Failed (CS): - avg: AVG(SPI_RA_REQ_NO_ALLOC_CSN) - min: MIN(SPI_RA_REQ_NO_ALLOC_CSN) - max: MAX(SPI_RA_REQ_NO_ALLOC_CSN) - unit: Cycles - tips: - CS Stall: - avg: AVG(SPI_RA_RES_STALL_CSN) - min: MIN(SPI_RA_RES_STALL_CSN) - max: MAX(SPI_RA_RES_STALL_CSN) - unit: Cycles - tips: - CS Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * 
SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + unit: Pct + tips: + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + unit: Pct + tips: + Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - unit: pct + unit: Pct tips: - Scratch Stall: - avg: AVG(SPI_RA_TMP_STALL_CSN) - min: MIN(SPI_RA_TMP_STALL_CSN) - max: MAX(SPI_RA_TMP_STALL_CSN) - unit: Cycles + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + unit: Pct tips: Insufficient SIMD Waveslots: - avg: AVG(SPI_RA_WAVE_SIMD_FULL_CSN) - min: MIN(SPI_RA_WAVE_SIMD_FULL_CSN) - max: MAX(SPI_RA_WAVE_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient SIMD VGPRs: - avg: AVG(SPI_RA_VGPR_SIMD_FULL_CSN) - min: MIN(SPI_RA_VGPR_SIMD_FULL_CSN) - max: MAX(SPI_RA_VGPR_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient SIMD SGPRs: - avg: AVG(SPI_RA_SGPR_SIMD_FULL_CSN) - min: MIN(SPI_RA_SGPR_SIMD_FULL_CSN) - max: MAX(SPI_RA_SGPR_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient CU LDS: - avg: AVG(SPI_RA_LDS_CU_FULL_CSN) - min: MIN(SPI_RA_LDS_CU_FULL_CSN) - max: MAX(SPI_RA_LDS_CU_FULL_CSN) - unit: CU - tips: - Insufficient CU Barries: - avg: AVG(SPI_RA_BAR_CU_FULL_CSN) - min: MIN(SPI_RA_BAR_CU_FULL_CSN) - max: MAX(SPI_RA_BAR_CU_FULL_CSN) - unit: CU - tips: - Insufficient Bulky Resource: - avg: AVG(SPI_RA_BULKY_CU_FULL_CSN) - min: MIN(SPI_RA_BULKY_CU_FULL_CSN) - max: MAX(SPI_RA_BULKY_CU_FULL_CSN) - unit: CU - tips: - Reach CU Threadgroups Limit: - avg: AVG(SPI_RA_TGLIM_CU_FULL_CSN) - min: MIN(SPI_RA_TGLIM_CU_FULL_CSN) - max: MAX(SPI_RA_TGLIM_CU_FULL_CSN) - unit: Cycles - tips: - Reach CU Wave Limit: - avg: 
AVG(SPI_RA_WVLIM_STALL_CSN) - min: MIN(SPI_RA_WVLIM_STALL_CSN) - max: MAX(SPI_RA_WVLIM_STALL_CSN) - unit: Cycles + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: - VGPR Writes: - avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: diff --git a/src/omniperf_analyze/configs/gfx906/0700_wavefront-launch.yaml b/src/omniperf_analyze/configs/gfx906/0700_wavefront-launch.yaml index 70141193e..abcaae418 100644 --- a/src/omniperf_analyze/configs/gfx906/0700_wavefront-launch.yaml +++ b/src/omniperf_analyze/configs/gfx906/0700_wavefront-launch.yaml @@ -77,7 +77,7 @@ Panel Config: avg: AVG(scr) min: MIN(scr) max: MAX(scr) - unit: Bytes + unit: Bytes/Workitem tips: - metric_table: @@ -103,7 +103,7 @@ Panel Config: max: MAX(GRBM_GUI_ACTIVE) unit: Cycle tips: - Instr/wavefront: + Instructions per wavefront: avg: AVG((SQ_INSTS / SQ_WAVES)) min: MIN((SQ_INSTS / SQ_WAVES)) max: MAX((SQ_INSTS / SQ_WAVES)) diff --git a/src/omniperf_analyze/configs/gfx906/1000_compute-unit-instruction-mix.yaml b/src/omniperf_analyze/configs/gfx906/1000_compute-unit-instruction-mix.yaml index 679acc34d..0092c202c 100644 --- a/src/omniperf_analyze/configs/gfx906/1000_compute-unit-instruction-mix.yaml +++ b/src/omniperf_analyze/configs/gfx906/1000_compute-unit-instruction-mix.yaml @@ -10,7 +10,7 @@ Panel Config: data source: - metric_table: id: 1001 - title: Instruction Mix + title: Overall Instruction Mix header: metric: Metric avg: Avg @@ -22,7 +22,7 @@ Panel Config: type: simple_bar label_txt: (# of instr + $normUnit) metric: - VALU - Vector: + VALU: avg: None # No HW module min: None # No HW module max: None # No HW module @@ -40,7 +40,7 @@ Panel Config: max: MAX((SQ_INSTS_LDS / $denom)) unit: (instr + $normUnit) tips: - VALU - MFMA: + MFMA: avg: None # No HW module min: None # No HW module max: None # No HW module @@ -64,12 +64,6 @@ Panel Config: max: MAX((SQ_INSTS_BRANCH / $denom)) unit: (instr + $normUnit) tips: - GDS: - avg: AVG((SQ_INSTS_GDS / $denom)) - min: MIN((SQ_INSTS_GDS / $denom)) - max: MAX((SQ_INSTS_GDS / $denom)) - unit: (instr + 
$normUnit) - tips: - metric_table: id: 1002 @@ -103,7 +97,7 @@ Panel Config: max: None # No HW module unit: (instr + $normUnit) tips: - F16-Mult: + F16-MUL: avg: None # No HW module min: None # No HW module max: None # No HW module @@ -127,7 +121,7 @@ Panel Config: max: None # No HW module unit: (instr + $normUnit) tips: - F32-Mult: + F32-MUL: avg: None # No HW module min: None # No HW module max: None # No HW module @@ -151,7 +145,7 @@ Panel Config: max: None # No HW module unit: (instr + $normUnit) tips: - F64-Mult: + F64-MUL: avg: None # No HW module min: None # No HW module max: None # No HW module @@ -180,55 +174,100 @@ Panel Config: id: 1003 title: VMEM Instr Mix header: - type: Type - count: Count + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit tips: Tips metric: - Buffer Instr: - count: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - tips: - Buffer Read: - count: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - tips: - Buffer Write: - count: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - tips: - Buffer Atomic: - count: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - tips: - Flat Instr: - count: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - tips: - Flat Read: - count: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - tips: - Flat Write: - count: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - tips: - Flat Atomic: - count: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - tips: + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Read: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: - metric_table: id: 1004 title: MFMA Arithmetic Instr Mix header: - type: Type - count: Count + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit tips: Tips metric: MFMA-I8: - count: None # No HW module - tips: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) + tips: MFMA-F16: - count: None # No HW module - tips: + avg: None # No HW module + min: None # No HW module + 
max: None # No HW module + unit: (instr + $normUnit) + tips: MFMA-BF16: - count: None # No HW module - tips: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) + tips: MFMA-F32: - count: None # No HW module + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) tips: MFMA-F64: - count: None # No HW module - tips: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx906/1100_compute-unit-compute-pipeline.yaml b/src/omniperf_analyze/configs/gfx906/1100_compute-unit-compute-pipeline.yaml index 4ea952637..63019bfec 100644 --- a/src/omniperf_analyze/configs/gfx906/1100_compute-unit-compute-pipeline.yaml +++ b/src/omniperf_analyze/configs/gfx906/1100_compute-unit-compute-pipeline.yaml @@ -13,7 +13,10 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak tips: Tips style: type: simple_bar @@ -21,23 +24,47 @@ Panel Config: label_txt: (%) xrange: [0, 110] metric: - valu_flops_pop: + VALU FLOPs: + value: None # No perf counter + Unit: None + peak: None + pop: None + tips: + VALU IOPs: value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_bf16_pop: + MFMA FLOPs (BF16): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_f16_pop: + MFMA FLOPs (F16): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_f32_pop: + MFMA FLOPs (F32): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_f64_pop: + MFMA FLOPs (F64): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_i8_pop: + MFMA IOPs (INT8): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - metric_table: @@ -51,36 +78,48 @@ Panel Config: unit: Unit tips: Tips metric: - IPC (Avg): + IPC: avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) unit: Instr/cycle tips: - IPC (Issue): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + IPC (Issued): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) - min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) unit: Instr/cycle tips: - SALU Util: + SALU Utilization: avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) min: MIN((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) max: MAX((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) 
unit: pct tips: - VALU Util: + VALU Utilization: avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) min: MIN((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) max: MAX((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) unit: pct tips: + VMEM Utilization: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: pct + tips: + Branch Utilization: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: pct + tips: VALU Active Threads: avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) @@ -90,7 +129,7 @@ Panel Config: != 0) else None)) unit: Threads tips: - MFMA Util: + MFMA Utilization: avg: None # No HW module min: None # No HW module max: None # No HW module @@ -102,6 +141,20 @@ Panel Config: max: None # No HW module unit: cycles/instr tips: + VMEM Latency: + avg: None # No perf counter + min: None # No perf counter + max: None # No perf counter + unit: Cycles + coll_level: SQ_INST_LEVEL_VMEM + tips: + SMEM Latency: + avg: None # No perf counter + min: None # No perf counter + max: None # No perf counter + unit: Cycles + coll_level: SQ_INST_LEVEL_SMEM + tips: - metric_table: id: 1103 @@ -120,7 +173,7 @@ Panel Config: max: None # No perf counter unit: (OPs + $normUnit) tips: - INT8 OPs: + IOPs (Total): avg: None # No perf counter min: None # No perf counter max: None # No perf counter @@ -150,41 +203,9 @@ Panel Config: max: None # No perf counter unit: (OPs + $normUnit) tips: - - - metric_table: - id: 1104 - title: Memory Latencies - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - VMEM Latency: - avg: None # No perf counter - min: None # No perf counter - max: None # No perf counter - unit: Cycles - tips: SQ_INSTS_LEVEL_VMEM - SMEM Latency: - avg: None # No perf counter - min: None # No perf counter - max: None # No perf counter - unit: Cycles - tips: SQ_INSTS_LEVEL_SMEM - Instr Fetch Latency: - avg: None # No perf counter - min: None # No perf counter - max: None # No perf counter - unit: Cycles - tips: SQ_IFETCH_LEVEL - LDS Latency: + INT8 OPs: avg: None # No perf counter min: None # No perf counter max: None # No perf counter - unit: Cycles - tips: SQ_INST_LEVEL_LDS - - + unit: (OPs + $normUnit) + tips: diff --git a/src/omniperf_analyze/configs/gfx906/1200_lds.yaml b/src/omniperf_analyze/configs/gfx906/1200_lds.yaml index 3fd52c3b1..8e40452dc 100644 --- a/src/omniperf_analyze/configs/gfx906/1200_lds.yaml +++ b/src/omniperf_analyze/configs/gfx906/1200_lds.yaml @@ -26,20 +26,24 @@ Panel Config: value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / (GRBM_GUI_ACTIVE * $numCU))) unit: Pct of Peak tips: + unit: pct Access Rate: value: AVG(((200 * SQ_ACTIVE_INST_LDS) / (GRBM_GUI_ACTIVE * $numCU))) unit: Pct of Peak tips: - Bandwidth (Pct-of-Peak): + unit: pct + Theoretical Bandwidth (% of Peak): value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs)) / (($sclk * $numCU) * 0.00128))) unit: Pct of Peak tips: + unit: pct Bank Conflict Rate: value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) unit: Pct of Peak tips: + unit: pct - metric_table: id: 1202 @@ -58,7 +62,7 @@ Panel Config: max: MAX((SQ_INSTS_LDS / $denom)) unit: (Instr + $normUnit) tips: - Bandwidth: + Theoretical Bandwidth: avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / 
$denom)) min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) @@ -67,7 +71,14 @@ Panel Config: / $denom)) unit: (Bytes + $normUnit) tips: - Bank Conficts/Access: + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + tips: + Bank Conflicts/Access: avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) @@ -82,7 +93,7 @@ Panel Config: max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) unit: (Cycles + $normUnit) tips: - Atomic Cycles: + Atomic Return Cycles: avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) @@ -110,12 +121,5 @@ Panel Config: avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) - unit: ( + $normUnit) - tips: - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_LDS - tips: + unit: (Accesses + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx906/1300_instruction-cache.yaml b/src/omniperf_analyze/configs/gfx906/1300_instruction-cache.yaml index 05dc75980..555bc714a 100644 --- a/src/omniperf_analyze/configs/gfx906/1300_instruction-cache.yaml +++ b/src/omniperf_analyze/configs/gfx906/1300_instruction-cache.yaml @@ -13,7 +13,7 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: @@ -27,11 +27,16 @@ Panel Config: * (EndNs - BeginNs)))) unit: Pct of Peak tips: - Cache Hit: + Cache Hit Rate: value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + SQC_ICACHE_MISSES_DUPLICATE))) unit: Pct of Peak tips: + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($sclk * $numSQC) + * (EndNs - BeginNs)))) + unit: Pct of Peak + tips: - metric_table: id: 1302 @@ -68,7 +73,7 @@ Panel Config: max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) unit: (Misses + $normUnit) tips: - Cache Hit: + Cache Hit Rate: avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + SQC_ICACHE_MISSES_DUPLICATE))) min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + @@ -77,3 +82,27 @@ Panel Config: SQC_ICACHE_MISSES_DUPLICATE))) unit: pct tips: + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + tips: + - metric_table: + id: 1303 + title: Instruction Cache - L2 Interface + header: + metric: Metric + mean: Mean + min: Min + max: Max + unit: Unit + tips: Tips + metric: + L1I-L2 Bandwidth: + mean: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) + tips: \ No newline at end of file diff --git 
a/src/omniperf_analyze/configs/gfx906/1400_constant-cache.yaml b/src/omniperf_analyze/configs/gfx906/1400_constant-cache.yaml index 563caad13..0a2bc4b57 100644 --- a/src/omniperf_analyze/configs/gfx906/1400_constant-cache.yaml +++ b/src/omniperf_analyze/configs/gfx906/1400_constant-cache.yaml @@ -12,8 +12,8 @@ Panel Config: id: 1401 title: Speed-of-Light header: - mertic: Metric - value: Value + metric: Metric + value: Avg unit: Unit tips: Tips style: @@ -27,12 +27,17 @@ Panel Config: * (EndNs - BeginNs)))) unit: Pct of Peak tips: - Cache Hit: + Cache Hit Rate: value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: Pct of Peak tips: + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) + / (2 * ($sclk * $numSQC) * (EndNs - BeginNs))) + unit: Pct of Peak + tips: - metric_table: id: 1402 @@ -138,6 +143,12 @@ Panel Config: unit: Unit tips: Tips metric: + sL1D-L2 BW: + mean: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + unit: (Bytes + $normUnit) + tips: Read Req: avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) min: MIN((SQC_TC_DATA_READ_REQ / $denom)) diff --git a/src/omniperf_analyze/configs/gfx906/1500_TA_and_TD.yaml b/src/omniperf_analyze/configs/gfx906/1500_TA_and_TD.yaml index 8f71cedc9..773bb7c76 100644 --- a/src/omniperf_analyze/configs/gfx906/1500_TA_and_TD.yaml +++ b/src/omniperf_analyze/configs/gfx906/1500_TA_and_TD.yaml @@ -6,11 +6,11 @@ Metric Description: # Define the panel properties and properties of each metric in the panel. 
Panel Config: id: 1500 - title: Texture Addresser and Texture Data (TA/TD) + title: Address Processing Unit and Data Return Path (TA/TD) data source: - metric_table: id: 1501 - title: TA + title: Address Processing Unit header: metric: Metric avg: Avg @@ -19,25 +19,25 @@ Panel Config: unit: Unit tips: Tips metric: - TA Busy: + Address Processing Unit Busy: avg: AVG(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TA Addr Stall: + Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TA Data Stall: + Data Stall: avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TD2TA Addr Stall: + Data-Processor → Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) @@ -47,69 +47,69 @@ Panel Config: avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Instr: + Global/Generic Instructions: avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Read Instr: + Global/Generic Read Instructions: avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Write Instr: + Global/Generic Write Instructions: avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Atomic Instr: + Global/Generic Atomic Instructions: avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Instr: + Spill/Stack Instructions: avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Read Instr: + Spill/Stack Read Instructions: avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Write Instr: + Spill/Stack Write Instructions: avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: 
(Instructions + $normUnit) tips: - Buffer Atomic Instr: + Spill/Stack Atomic Instructions: avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Total Cylces: + Spill/Stack Total Cycles: avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) tips: - Buffer Coalesced Read: + Spill/Stack Coalesced Read: avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) tips: - Buffer Coalesced Write: + Spill/Stack Coalesced Write: avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) @@ -118,7 +118,7 @@ Panel Config: - metric_table: id: 1502 - title: TD + title: Data-Return Path header: metric: Metric avg: Avg @@ -127,48 +127,48 @@ Panel Config: unit: Unit tips: Tips metric: - TD Busy: + Data-Return Busy: avg: AVG(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TD Stall: + Cache RAM → Data-Return Stall: avg: AVG(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - SPI2TD Stall: + Workgroup manager → Data-Return Stall: avg: # No perf counter min: # No perf counter max: # No perf counter unit: pct tips: - Coalescable Instr: + Coalescable Instructions: avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Load Instr: + Read Instructions: avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Store Instr: + Write Instructions: avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Atomic Instr: + Atomic Instructions: avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: diff --git a/src/omniperf_analyze/configs/gfx906/1600_L1_cache.yaml b/src/omniperf_analyze/configs/gfx906/1600_L1_cache.yaml index 01e6d29d7..66f6a5e3d 100644 --- a/src/omniperf_analyze/configs/gfx906/1600_L1_cache.yaml +++ b/src/omniperf_analyze/configs/gfx906/1600_L1_cache.yaml @@ -13,7 +13,7 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: @@ -22,26 +22,26 @@ Panel Config: label_txt: (%) xrange: [0, 110] metric: - Buffer Coalescing: - value: 
AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - tips: - Cache Util: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) unit: Pct of Peak tips: - Cache BW: + Bandwidth: value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs)))) / ((($sclk / 1000) * 64) * $numCU)) unit: Pct of Peak tips: - Cache Hit: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + tips: + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) unit: Pct of Peak tips: @@ -141,11 +141,26 @@ Panel Config: unit: (Req + $normUnit) tips: Cache BW: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - unit: GB/s + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + unit: (Bytes + $normUnit) tips: + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + unit: pct + tips: Cache Accesses: avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) @@ -164,22 +179,7 @@ Panel Config: / $denom)) unit: (Req + $normUnit) tips: - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: pct - tips: - Invalidate: + Invalidations: avg: 
AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) @@ -188,9 +188,9 @@ Panel Config: L1-L2 BW: avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - min: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - max: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) unit: (Bytes + $normUnit) tips: @@ -388,17 +388,17 @@ Panel Config: avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Hits + $normUnit) + units: (Req + $normUnit) tips: - Misses (Translation): + Translation Misses: avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Misses + $normUnit) + units: (Req + $normUnit) tips: - Misses (Permission): + Permission Misses: avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Misses + $normUnit) + units: (Req + $normUnit) tips: diff --git a/src/omniperf_analyze/configs/gfx906/1700_L2_cache.yaml b/src/omniperf_analyze/configs/gfx906/1700_L2_cache.yaml index 0b5f5e827..8cc5cf53b 100644 --- a/src/omniperf_analyze/configs/gfx906/1700_L2_cache.yaml +++ b/src/omniperf_analyze/configs/gfx906/1700_L2_cache.yaml @@ -13,31 +13,35 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: type: simple_bar metric: - L2 Util: + Utilization: value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($L2Banks) * GRBM_GUI_ACTIVE))) unit: pct + tips: + Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 64) / (EndNs - BeginNs)))) / ((($sclk / 1000) * 64) * TO_INT($L2Banks))) + unit: pct tips: - Cache Hit: + Hit Rate: value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)) unit: pct - tips: - L2-EA Rd BW: + tips: + L2-Fabric Read BW: value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / (EndNs - BeginNs))) unit: GB/s - tips: - L2-EA Wr BW: + tips: + L2-Fabric Write and Atomic BW: value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / (EndNs - BeginNs))) unit: GB/s - tips: + tips: - metric_table: id: 1702 @@ -50,7 +54,7 @@ Panel Config: unit: Unit tips: Tips metric: - Read BW: + L2-Fabric Read BW: avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / $denom)) min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) @@ -58,8 +62,26 @@ Panel Config: max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / $denom)) unit: (Bytes + $normUnit) - tips: - Write BW: + tips: + HBM Read Traffic: + avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * 
(TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + Remote Read Traffic: + avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + L2-Fabric Write and Atomic BW: avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / $denom)) min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) @@ -67,55 +89,31 @@ Panel Config: max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / $denom)) unit: (Bytes + $normUnit) - tips: - Read (32B): - avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (Uncached 32B): - avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (64B): - avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - HBM Read: - avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write (32B): - avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - Write (Uncached 32B): - avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write (64B): - avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Write: - avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: + tips: + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Remote Write and Atomic Traffic: + avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_WRREQ_sum - 
TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Atomic Traffic: + avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: Read Latency: avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) @@ -124,7 +122,7 @@ Panel Config: max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) unit: Cycles - tips: + tips: Write Latency: avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) @@ -133,7 +131,7 @@ Panel Config: max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) unit: Cycles - tips: + tips: Atomic Latency: avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum != 0) else None)) @@ -142,7 +140,7 @@ Panel Config: max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum != 0) else None)) unit: Cycles - tips: + tips: Read Stall: avg: AVG((((100 * ((TCC_EA_RDREQ_IO_CREDIT_STALL_sum + TCC_EA_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != @@ -154,7 +152,7 @@ Panel Config: + TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) unit: pct - tips: + tips: Write Stall: avg: AVG((((100 * ((TCC_EA_WRREQ_IO_CREDIT_STALL_sum + TCC_EA_WRREQ_GMI_CREDIT_STALL_sum) + TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != @@ -166,7 +164,7 @@ Panel Config: + TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) unit: pct - tips: + tips: - metric_table: id: 1703 @@ -179,121 +177,127 @@ Panel Config: unit: Unit tips: Tips metric: + Bandwidth: + avg: AVG((TCC_REQ_sum * 64) / $denom) + min: MIN((TCC_REQ_sum * 64) / $denom) + max: MAX((TCC_REQ_sum * 64) / $denom) + unit: (Bytes + $normUnit) + tips: Req: avg: AVG((TCC_REQ_sum / $denom)) min: MIN((TCC_REQ_sum / $denom)) max: MAX((TCC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: + tips: Read Req: avg: AVG((TCC_READ_sum / $denom)) min: MIN((TCC_READ_sum / $denom)) max: MAX((TCC_READ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: Write Req: avg: AVG((TCC_WRITE_sum / $denom)) min: MIN((TCC_WRITE_sum / $denom)) max: MAX((TCC_WRITE_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: Atomic Req: avg: AVG((TCC_ATOMIC_sum / $denom)) min: MIN((TCC_ATOMIC_sum / $denom)) max: MAX((TCC_ATOMIC_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: 
MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: Probe Req: avg: AVG((TCC_PROBE_sum / $denom)) min: MIN((TCC_PROBE_sum / $denom)) max: MAX((TCC_PROBE_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + tips: Hits: avg: AVG((TCC_HIT_sum / $denom)) min: MIN((TCC_HIT_sum / $denom)) max: MAX((TCC_HIT_sum / $denom)) unit: (Hits + $normUnit) - tips: + tips: Misses: avg: AVG((TCC_MISS_sum / $denom)) min: MIN((TCC_MISS_sum / $denom)) max: MAX((TCC_MISS_sum / $denom)) unit: (Misses + $normUnit) - tips: - Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - tips: + tips: Writeback: avg: AVG((TCC_WRITEBACK_sum / $denom)) min: MIN((TCC_WRITEBACK_sum / $denom)) max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: + unit: (Cachelines + $normUnit) + tips: + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Evict (Normal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: NC Req: avg: AVG((TCC_NC_REQ_sum / $denom)) min: MIN((TCC_NC_REQ_sum / $denom)) max: MAX((TCC_NC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: UC Req: avg: AVG((TCC_UC_REQ_sum / $denom)) min: MIN((TCC_UC_REQ_sum / $denom)) max: MAX((TCC_UC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: CC Req: avg: AVG((TCC_CC_REQ_sum / $denom)) min: MIN((TCC_CC_REQ_sum / $denom)) max: MAX((TCC_CC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: RW Req: avg: None # No HW module min: None # No HW module max: None # No HW module unit: (Req + $normUnit) - tips: - Writeback (Normal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: - Writeback (TC Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: - Evict (Normal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: ( + $normUnit) - tips: - Evict (TC Req): - avg: 
AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: ( + $normUnit) - tips: + tips: - metric_table: id: 1704 - title: L2 - EA Interface Stalls + title: L2 - Fabric Interface Stalls header: metric: Metric type: Type @@ -306,59 +310,137 @@ Panel Config: style: type: simple_multi_bar metric: - Read - Remote Socket Stall: - type: Remote Socket Stall + Read - PCIe Stall: + type: PCIe Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read - Peer GCD Stall: - type: Peer GCD Stall + avg: AVG(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Read - Infinity Fabric™ Stall: + type: Infinity Fabric™ Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Read - HBM Stall: type: HBM Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write - Remote Socket Stall: - type: Remote Socket Stall + avg: AVG(((100 * (TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Write - PCIe Stall: + type: PCIe Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write - Peer GCD Stall: - type: Peer GCD Stall + avg: AVG(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Write - Infinity Fabric™ Stall: + type: Infinity Fabric™ Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * 
(TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Write - HBM Stall: type: HBM Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Write - Credit Starvation: type: Credit Starvation transaction: Write - avg: AVG((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) - min: MIN((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) - max: MAX((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + + - metric_table: + id: 1705 + title: L2 - Fabric Detailed Transaction Breakdown + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Read (32B): + avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: + Read (Uncached): + avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Read (64B): + avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + unit: (Req + $normUnit) + tips: + HBM Read: + avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + tips: + Remote Read: + avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (32B): + avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (Uncached): + avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (64B): + avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + tips: + HBM Write and Atomic: + avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + tips: + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_WRREQ_sum - 
TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + tips: + Atomic: + avg: AVG((TCC_EA_ATOMIC_sum / $denom)) + min: MIN((TCC_EA_ATOMIC_sum / $denom)) + max: MAX((TCC_EA_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx906/1800_L2_cache_per_channel.yaml b/src/omniperf_analyze/configs/gfx906/1800_L2_cache_per_channel.yaml index 7a808c5b8..c7d1851e7 100644 --- a/src/omniperf_analyze/configs/gfx906/1800_L2_cache_per_channel.yaml +++ b/src/omniperf_analyze/configs/gfx906/1800_L2_cache_per_channel.yaml @@ -13,7 +13,7 @@ Panel Config: title: Aggregate Stats (All 32 channels) header: metric: Metric - avg: Mean + avg: Avg std dev: Std Dev min: Min max: Max @@ -167,7 +167,7 @@ Panel Config: + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) unit: pct tips: - Req: + L2 Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_REQ[0]) + TO_INT(TCC_REQ[1])) + TO_INT(TCC_REQ[2])) + TO_INT(TCC_REQ[3])) + TO_INT(TCC_REQ[4])) + TO_INT(TCC_REQ[5])) + TO_INT(TCC_REQ[6])) + TO_INT(TCC_REQ[7])) + TO_INT(TCC_REQ[8])) + TO_INT(TCC_REQ[9])) @@ -206,7 +206,7 @@ Panel Config: + TO_INT(TCC_REQ[30])) + TO_INT(TCC_REQ[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Read Req: + L2 Read Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_READ[0]) + TO_INT(TCC_READ[1])) + TO_INT(TCC_READ[2])) + TO_INT(TCC_READ[3])) + TO_INT(TCC_READ[4])) + TO_INT(TCC_READ[5])) + TO_INT(TCC_READ[6])) + TO_INT(TCC_READ[7])) + TO_INT(TCC_READ[8])) + TO_INT(TCC_READ[9])) @@ -249,7 +249,7 @@ Panel Config: + TO_INT(TCC_READ[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Write Req: + L2 Write Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_WRITE[0]) + TO_INT(TCC_WRITE[1])) + TO_INT(TCC_WRITE[2])) + TO_INT(TCC_WRITE[3])) + TO_INT(TCC_WRITE[4])) + TO_INT(TCC_WRITE[5])) + TO_INT(TCC_WRITE[6])) + TO_INT(TCC_WRITE[7])) + TO_INT(TCC_WRITE[8])) @@ -296,7 +296,7 @@ Panel Config: + TO_INT(TCC_WRITE[30])) + TO_INT(TCC_WRITE[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Atomic Req: + L2 Atomic Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_ATOMIC[0]) + TO_INT(TCC_ATOMIC[1])) + TO_INT(TCC_ATOMIC[2])) + TO_INT(TCC_ATOMIC[3])) + TO_INT(TCC_ATOMIC[4])) + TO_INT(TCC_ATOMIC[5])) + TO_INT(TCC_ATOMIC[6])) + TO_INT(TCC_ATOMIC[7])) @@ -347,7 +347,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Read Req: + L2 - Fabric Read Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ[0]) + TO_INT(TCC_EA_RDREQ[1])) + TO_INT(TCC_EA_RDREQ[2])) + TO_INT(TCC_EA_RDREQ[3])) + TO_INT(TCC_EA_RDREQ[4])) + TO_INT(TCC_EA_RDREQ[5])) + TO_INT(TCC_EA_RDREQ[6])) + TO_INT(TCC_EA_RDREQ[7])) @@ -398,7 +398,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Write Req: + L2 - Fabric Write Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ[0]) + TO_INT(TCC_EA_WRREQ[1])) + TO_INT(TCC_EA_WRREQ[2])) + TO_INT(TCC_EA_WRREQ[3])) + TO_INT(TCC_EA_WRREQ[4])) + TO_INT(TCC_EA_WRREQ[5])) + TO_INT(TCC_EA_WRREQ[6])) + TO_INT(TCC_EA_WRREQ[7])) @@ -449,7 +449,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Atomic Req: + L2 - Fabric Atomic Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_ATOMIC[0]) + TO_INT(TCC_EA_ATOMIC[1])) + TO_INT(TCC_EA_ATOMIC[2])) + TO_INT(TCC_EA_ATOMIC[3])) + TO_INT(TCC_EA_ATOMIC[4])) + TO_INT(TCC_EA_ATOMIC[5])) + 
TO_INT(TCC_EA_ATOMIC[6])) + TO_INT(TCC_EA_ATOMIC[7])) @@ -500,7 +500,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Read Lat: + L2 - Fabric Read Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_RDREQ_LEVEL[0] + TCC_EA_RDREQ_LEVEL[1]) + TCC_EA_RDREQ_LEVEL[2]) + TCC_EA_RDREQ_LEVEL[3]) + TCC_EA_RDREQ_LEVEL[4]) + TCC_EA_RDREQ_LEVEL[5]) + TCC_EA_RDREQ_LEVEL[6]) + TCC_EA_RDREQ_LEVEL[7]) @@ -615,7 +615,7 @@ Panel Config: + TCC_EA_RDREQ[29]) + TCC_EA_RDREQ[30]) + TCC_EA_RDREQ[31]) != 0) else None)) unit: Cycles tips: - L2 - EA Write Lat: + L2 - Fabric Write Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_WRREQ_LEVEL[0] + TCC_EA_WRREQ_LEVEL[1]) + TCC_EA_WRREQ_LEVEL[2]) + TCC_EA_WRREQ_LEVEL[3]) + TCC_EA_WRREQ_LEVEL[4]) + TCC_EA_WRREQ_LEVEL[5]) + TCC_EA_WRREQ_LEVEL[6]) + TCC_EA_WRREQ_LEVEL[7]) @@ -730,7 +730,7 @@ Panel Config: + TCC_EA_WRREQ[29]) + TCC_EA_WRREQ[30]) + TCC_EA_WRREQ[31]) != 0) else None)) unit: Cycles tips: - L2 - EA Atomic Lat: + L2 - Fabric Atomic Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_ATOMIC_LEVEL[0] + TCC_EA_ATOMIC_LEVEL[1]) + TCC_EA_ATOMIC_LEVEL[2]) + TCC_EA_ATOMIC_LEVEL[3]) + TCC_EA_ATOMIC_LEVEL[4]) + TCC_EA_ATOMIC_LEVEL[5]) + TCC_EA_ATOMIC_LEVEL[6]) + TCC_EA_ATOMIC_LEVEL[7]) @@ -849,49 +849,49 @@ Panel Config: None)) unit: Cycles tips: - L2 - EA Read Stall (IO): + L2 - Fabric Read Stall (PCIe): avg: None # No perf counter std dev: None # No perf counter min: None # No perf counter max: None # No perf counter unit: (Cycles + $normUnit) tips: - L2 - EA Read Stall (GMI): + L2 - Fabric Read Stall (Infinity Fabric™): avg: None # No perf counter std dev: None # No perf counter min: None # No perf counter max: None # No perf counter unit: (Cycles + $normUnit) tips: - L2 - EA Read Stall (DRAM): + L2 - Fabric Read Stall (HBM): avg: None # No perf counter std dev: None # No perf counter min: None # No perf counter max: None # No perf counter unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (IO): + L2 - Fabric Write Stall (PCIe): avg: None # No perf counter std dev: None # No perf counter min: None # No perf counter max: None # No perf counter unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (GMI): + L2 - Fabric Write Stall (Infinity Fabric™): avg: None # No perf counter std dev: None # No perf counter min: None # No perf counter max: None # No perf counter unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (DRAM): + L2 - Fabric Write Stall (HBM): avg: None # No perf counter std dev: None # No perf counter min: None # No perf counter max: None # No perf counter unit: (Cycles + $normUnit) tips: - L2 - EA Write Starve: + L2 - Fabric Write Starve: avg: None # No perf counter std dev: None # No perf counter min: None # No perf counter @@ -906,22 +906,22 @@ Panel Config: channel: Channel hit rate: L2 Cache Hit Rate (%) req: Requests (Requests) - read req: L1-L2 Read (Requests) - write req: L1-L2 Write (Requests) - atomic req: L1-L2 Atomic (Requests) - ea read req: L2-EA Read (Requests) - ea write req: L2-EA Write (Requests) - ea atomic req: L2-EA Atomic (Requests) - ea read lat - cycles: L2-EA Read Latency (Cycles) - ea write lat - cycles: L2-EA Write Latency (Cycles) - ea atomic lat - cycles: L2-EA Atomic Latency (Cycles) - ea read stall - io: L2-EA Read Stall - IO (Cycles per) - ea read stall - gmi: L2-EA Read Stall - GMI (Cycles per) - ea read stall - dram: L2-EA Read Stall - DRAM (Cycles per) - ea write stall - io: L2-EA Write Stall - IO (Cycles per) - ea write stall - gmi: L2-EA Write Stall - GMI (Cycles per) - ea write 
stall - dram: L2-EA Write Stall - DRAM (Cycles per) - ea write stall - starve: L2-EA Write Stall - Starve (Cycles per) + read req: L2 Read (Requests) + write req: L2 Write (Requests) + atomic req: L2 Atomic (Requests) + ea read req: L2-Fabric Read (Requests) + ea write req: L2-Fabric Write and Atomic (Requests) + ea atomic req: L2-Fabric Atomic (Requests) + ea read lat - cycles: L2-Fabric Read Latency (Cycles) + ea write lat - cycles: L2-Fabric Write Latency (Cycles) + ea atomic lat - cycles: L2-Fabric Atomic Latency (Cycles) + ea read stall - io: L2-Fabric Read Stall - PCIe (Cycles per) + ea read stall - gmi: L2-Fabric Read Stall - Infinity Fabric™ (Cycles per) + ea read stall - dram: L2-Fabric Read Stall - HBM (Cycles per) + ea write stall - io: L2-Fabric Write Stall - PCIe (Cycles per) + ea write stall - gmi: L2-Fabric Write Stall - Infinity Fabric™ (Cycles per) + ea write stall - dram: L2-Fabric Write Stall - HBM (Cycles per) + ea write stall - starve: L2-Fabric Write Stall - Starve (Cycles per) tips: Tips metric: "0": @@ -1381,22 +1381,22 @@ Panel Config: channel: Channel hit rate: L2 Cache Hit Rate (%) req: Requests (Requests) - read req: L1-L2 Read (Requests) - write req: L1-L2 Write (Requests) - atomic req: L1-L2 Atomic (Requests) - ea read req: L2-EA Read (Requests) - ea write req: L2-EA Write (Requests) - ea atomic req: L2-EA Atomic (Requests) - ea read lat - cycles: L2-EA Read Latency (Cycles) - ea write lat - cycles: L2-EA Write Latency (Cycles) - ea atomic lat - cycles: L2-EA Atomic Latency (Cycles) - ea read stall - io: L2-EA Read Stall - IO (Cycles per) - ea read stall - gmi: L2-EA Read Stall - GMI (Cycles per) - ea read stall - dram: L2-EA Read Stall - DRAM (Cycles per) - ea write stall - io: L2-EA Write Stall - IO (Cycles per) - ea write stall - gmi: L2-EA Write Stall - GMI (Cycles per) - ea write stall - dram: L2-EA Write Stall - DRAM (Cycles per) - ea write stall - starve: L2-EA Write Stall - Starve (Cycles per) + read req: L2 Read (Requests) + write req: L2 Write (Requests) + atomic req: L2 Atomic (Requests) + ea read req: L2-Fabric Read (Requests) + ea write req: L2-Fabric Write and Atomic (Requests) + ea atomic req: L2-Fabric Atomic (Requests) + ea read lat - cycles: L2-Fabric Read Latency (Cycles) + ea write lat - cycles: L2-Fabric Write Latency (Cycles) + ea atomic lat - cycles: L2-Fabric Atomic Latency (Cycles) + ea read stall - io: L2-Fabric Read Stall - PCIe (Cycles per) + ea read stall - gmi: L2-Fabric Read Stall - Infinity Fabric™ (Cycles per) + ea read stall - dram: L2-Fabric Read Stall - HBM (Cycles per) + ea write stall - io: L2-Fabric Write Stall - PCIe (Cycles per) + ea write stall - gmi: L2-Fabric Write Stall - Infinity Fabric™ (Cycles per) + ea write stall - dram: L2-Fabric Write Stall - HBM (Cycles per) + ea write stall - starve: L2-Fabric Write Stall - Starve (Cycles per) tips: Tips metric: "16": diff --git a/src/omniperf_analyze/configs/gfx908/0200_system-speed-of-light.yaml b/src/omniperf_analyze/configs/gfx908/0200_system-speed-of-light.yaml index 986b2f0ae..bc9dea77f 100644 --- a/src/omniperf_analyze/configs/gfx908/0200_system-speed-of-light.yaml +++ b/src/omniperf_analyze/configs/gfx908/0200_system-speed-of-light.yaml @@ -14,10 +14,10 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit peak: Peak - pop: PoP + pop: Pct of Peak tips: Tips metric: VALU FLOPs: @@ -28,7 +28,7 @@ Panel Config: tips: VALU IOPs: value: None # No perf counter - unit: GOPs + unit: GIOPs peak: (((($sclk * $numCU) * 64) * 2) / 
1000) pop: None # No perf counter tips: @@ -68,25 +68,37 @@ Panel Config: peak: $numCU pop: ((100 * $numActiveCUs) / $numCU) tips: - SALU Util: + SALU Utilization: value: AVG(((100 * SQ_ACTIVE_INST_SCA) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct peak: 100 pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / (GRBM_GUI_ACTIVE * $numCU))) tips: - VALU Util: + VALU Utilization: value: AVG(((100 * SQ_ACTIVE_INST_VALU) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct peak: 100 pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / (GRBM_GUI_ACTIVE * $numCU))) tips: - MFMA Util: + MFMA Utilization: value: None # No HW module unit: pct peak: 100 pop: None # No HW module tips: - VALU Active Threads/Wave: + VMEM Utilization: + value: None # No HW module + unit: pct + peak: 100 + pop: None # No HW module + tips: + Branch Utilization: + value: None # No HW module + unit: pct + peak: 100 + pop: None # No HW module + tips: + VALU Active Threads: value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) unit: Threads @@ -94,25 +106,29 @@ Panel Config: pop: (AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) * 1.5625) tips: - IPC - Issue: - value: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) - / SQ_ACTIVE_INST_ANY)) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) unit: Instr/cycle peak: 5 - pop: ((100 * AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) - / SQ_ACTIVE_INST_ANY))) / 5) + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) tips: - LDS BW: + Wavefront Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE)) + unit: Wavefronts + peak: ($maxWavesPerCU * $numCU) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE) / ($maxWavesPerCU + * $numCU)))) + coll_level: SQ_LEVEL_WAVES + tips: + Theoretical LDS Bandwidth: value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs))) - unit: GB/sec + unit: GB/s peak: (($sclk * $numCU) * 0.128) pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs)) / (($sclk * $numCU) * 0.00128))) tips: - LDS Bank Conflict: + LDS Bank Conflicts/Access: value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) unit: Conflicts/access @@ -120,35 +136,7 @@ Panel Config: pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) tips: - Instr Cache Hit Rate: - value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - tips: - Instr Cache BW: - value: AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64)) - unit: GB/s - peak: ((($sclk / 1000) * 64) * $numSQC) - pop: ((100 * AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk - / 1000) * 64) * $numSQC)) - tips: - Scalar L1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - tips: - Scalar
L1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64)) - unit: GB/s - peak: ((($sclk / 1000) * 64) * $numSQC) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk - / 1000) * 64) * $numSQC)) - tips: - Vector L1D Cache Hit Rate: + vL1D Cache Hit Rate: value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else @@ -160,7 +148,7 @@ Panel Config: TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else None)) tips: - Vector L1D Cache BW: + vL1D Cache BW: value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) unit: GB/s peak: ((($sclk / 1000) * 64) * $numCU) @@ -175,6 +163,13 @@ Panel Config: pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else None)) tips: + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 64) / (EndNs - BeginNs))) + unit: GB/s + peak: ((($sclk / 1000) * 64) * TO_INT($L2Banks)) + pop: ((100 * AVG(((TCC_REQ_sum * 64) / (EndNs - BeginNs)))) + / ((($sclk / 1000) * 64) * TO_INT($L2Banks))) + tips: L2-Fabric Read BW: value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / (EndNs - BeginNs))) @@ -195,36 +190,48 @@ Panel Config: value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None tips: L2-Fabric Write Latency: value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None tips: - Wave Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE)) - unit: Wavefronts - peak: ($maxWavesPerCU * $numCU) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE) / ($maxWavesPerCU - * $numCU)))) - coll_level: SQ_LEVEL_WAVES + sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) tips: - Instr Fetch BW: - value: AVG(((SQ_IFETCH / (EndNs - BeginNs)) * 32)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64)) unit: GB/s - peak: ((($sclk / 1000) * 32) * $numSQC) - pop: ((100 * AVG(((SQ_IFETCH / (EndNs - BeginNs)) * 32))) / ($numSQC - * (($sclk / 1000) * 32))) - coll_level: SQ_IFETCH_LEVEL + peak: ((($sclk / 1000) * 64) * $numSQC) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk + / 1000) * 64) * $numSQC)) + tips: + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) tips: - Instr Fetch Latency: + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64)) + unit: GB/s + peak: ((($sclk / 1000) * 64) * $numSQC) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk + / 1000) * 64) * $numSQC)) + tips: + L1I Fetch Latency: value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None coll_level: SQ_IFETCH_LEVEL tips: diff --git a/src/omniperf_analyze/configs/gfx908/0500_command-processor.yaml b/src/omniperf_analyze/configs/gfx908/0500_command-processor.yaml 
index 525091879..edd42da6e 100644 --- a/src/omniperf_analyze/configs/gfx908/0500_command-processor.yaml +++ b/src/omniperf_analyze/configs/gfx908/0500_command-processor.yaml @@ -19,19 +19,7 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy Cycles: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles/Kernel - tips: - CPF Busy: - avg: AVG(CPF_CPF_STAT_BUSY) - min: MIN(CPF_CPF_STAT_BUSY) - max: MAX(CPF_CPF_STAT_BUSY) - unit: Cycles/Kernel - tips: - CPF Util: + CPF Utilization: avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) @@ -47,15 +35,9 @@ Panel Config: != 0) else None)) max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY != 0) else None)) - unit: Cycles/Kernel - tips: - L2Cache Intf Busy: - avg: AVG(CPF_CPF_TCIU_BUSY) - min: MIN(CPF_CPF_TCIU_BUSY) - max: MAX(CPF_CPF_TCIU_BUSY) - unit: Cycles/Kernel + unit: pct tips: - L2Cache Intf Util: + CPF-L2 Utilization: avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) @@ -64,7 +46,7 @@ Panel Config: if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) unit: pct tips: - L2Cache Intf Stall: + CPF-L2 Stall: avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY != 0) else None)) min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY @@ -73,11 +55,14 @@ Panel Config: != 0) else None)) unit: pct tips: - UTCL1 Stall: - avg: AVG(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - min: MIN(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - max: MAX(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - unit: Cycles/Kernel + CPF-UTCL1 Stall: + avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + unit: pct tips: - metric_table: @@ -91,19 +76,7 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy Cycles: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles - tips: - CPC Busy Cycles: - avg: AVG(CPC_CPC_STAT_BUSY) - min: MIN(CPC_CPC_STAT_BUSY) - max: MAX(CPC_CPC_STAT_BUSY) - unit: Cycles - tips: - CPC Util: + CPC Utilization: avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) @@ -112,12 +85,6 @@ Panel Config: if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) unit: pct tips: - CPC Stall Cycles: - avg: AVG(CPC_CPC_STAT_STALL) - min: MIN(CPC_CPC_STAT_STALL) - max: MAX(CPC_CPC_STAT_STALL) - unit: Cycles - tips: CPC Stall Rate: avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY != 0) else None)) @@ -127,28 +94,19 @@ Panel Config: != 0) else None)) unit: pct tips: - CPC Packet Decoding: - avg: AVG(CPC_ME1_BUSY_FOR_PACKET_DECODE) - min: MIN(CPC_ME1_BUSY_FOR_PACKET_DECODE) - max: MAX(CPC_ME1_BUSY_FOR_PACKET_DECODE) - unit: Cycles - tips: - SPI Intf Busy Cycles: - avg: 
AVG(CPC_ME1_DC0_SPI_BUSY) - min: MIN(CPC_ME1_DC0_SPI_BUSY) - max: MAX(CPC_ME1_DC0_SPI_BUSY) - unit: Cycles - tips: - SPI Intf Util: - avg: AVG((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) unit: pct tips: - L2Cache Intf Util: + CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + unit: Pct + tips: + CPC-L2 Utilization: avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) @@ -157,19 +115,16 @@ Panel Config: if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) unit: pct tips: - UTCL1 Stall Cycles: - avg: AVG(CPC_UTCL1_STALL_ON_TRANSLATION) - min: MIN(CPC_UTCL1_STALL_ON_TRANSLATION) - max: MAX(CPC_UTCL1_STALL_ON_TRANSLATION) - unit: Cycles - tips: - UTCL2 Intf Busy Cycles: - avg: AVG(CPC_CPC_UTCL2IU_BUSY) - min: MIN(CPC_CPC_UTCL2IU_BUSY) - max: MAX(CPC_CPC_UTCL2IU_BUSY) - unit: Cycles + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: pct tips: - UTCL2 Intf Util: + CPC-UTCL2 Utilization: avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) diff --git a/src/omniperf_analyze/configs/gfx908/0600_shader-processor-input.yaml b/src/omniperf_analyze/configs/gfx908/0600_shader-processor-input.yaml index 38b81ed4f..24d4036ec 100644 --- a/src/omniperf_analyze/configs/gfx908/0600_shader-processor-input.yaml +++ b/src/omniperf_analyze/configs/gfx908/0600_shader-processor-input.yaml @@ -6,11 +6,11 @@ Metric Description: # Define the panel properties and properties of each metric in the panel. 
Panel Config: id: 600 - title: Shader Processor Input (SPI) + title: Workgroup Manager (SPI) data source: - metric_table: id: 601 - title: SPI Stats + title: Workgroup Manager Utilizations header: metric: Metric avg: Avg @@ -19,29 +19,35 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles - tips: - CS Busy: - avg: AVG(SPI_CSN_BUSY) - min: MIN(SPI_CSN_BUSY) - max: MAX(SPI_CSN_BUSY) - unit: Cycles - tips: - SPI Busy: - avg: AVG(GRBM_SPI_BUSY) - min: MIN(GRBM_SPI_BUSY) - max: MAX(GRBM_SPI_BUSY) - unit: Cycles - tips: - SQ Busy: - avg: AVG(SQ_BUSY_CYCLES) - min: MIN(SQ_BUSY_CYCLES) - max: MAX(SQ_BUSY_CYCLES) - unit: Cycles + Accelerator Utilization: + avg: AVG(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + min: MIN(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + max: MAX(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + unit: Pct + tips: + Scheduler-Pipe Utilization: + avg: AVG(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + min: MIN(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + max: MAX(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + unit: Pct + tips: + Workgroup Manager Utilization: + avg: AVG(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + min: MIN(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + max: MAX(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + unit: Pct + tips: + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + min: MIN(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + max: MAX(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + unit: Pct + tips: + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Dispatched Workgroups: avg: AVG(SPI_CSN_NUM_THREADGROUPS) @@ -55,22 +61,27 @@ Panel Config: max: MAX(SPI_CSN_WAVE) unit: Wavefronts tips: - Wave Alloc Failed: - avg: AVG(SPI_RA_REQ_NO_ALLOC) - min: MIN(SPI_RA_REQ_NO_ALLOC) - max: MAX(SPI_RA_REQ_NO_ALLOC) - unit: Cycles - tips: - Wave Alloc Failed - CS: - avg: AVG(SPI_RA_REQ_NO_ALLOC_CSN) - min: MIN(SPI_RA_REQ_NO_ALLOC_CSN) - max: MAX(SPI_RA_REQ_NO_ALLOC_CSN) - unit: Cycles + VGPR Writes: + avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + tips: + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave tips: - - metric_table: id: 602 - title: SPI Resource Allocation + title: Workgroup Manager - Resource Allocation header: metric: Metric avg: Avg @@ -79,96 +90,78 @@ Panel Config: unit: Unit tips: Tips metric: - Wave request Failed (CS): - avg: AVG(SPI_RA_REQ_NO_ALLOC_CSN) - min: MIN(SPI_RA_REQ_NO_ALLOC_CSN) - max: MAX(SPI_RA_REQ_NO_ALLOC_CSN) - unit: Cycles - tips: - CS Stall: - avg: AVG(SPI_RA_RES_STALL_CSN) - min: MIN(SPI_RA_RES_STALL_CSN) - max: MAX(SPI_RA_RES_STALL_CSN) - unit: Cycles - tips: - CS Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * 
SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + unit: Pct + tips: + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + unit: Pct + tips: + Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - unit: pct + unit: Pct tips: - Scratch Stall: - avg: AVG(SPI_RA_TMP_STALL_CSN) - min: MIN(SPI_RA_TMP_STALL_CSN) - max: MAX(SPI_RA_TMP_STALL_CSN) - unit: Cycles + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + unit: Pct tips: Insufficient SIMD Waveslots: - avg: AVG(SPI_RA_WAVE_SIMD_FULL_CSN) - min: MIN(SPI_RA_WAVE_SIMD_FULL_CSN) - max: MAX(SPI_RA_WAVE_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient SIMD VGPRs: - avg: AVG(SPI_RA_VGPR_SIMD_FULL_CSN) - min: MIN(SPI_RA_VGPR_SIMD_FULL_CSN) - max: MAX(SPI_RA_VGPR_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient SIMD SGPRs: - avg: AVG(SPI_RA_SGPR_SIMD_FULL_CSN) - min: MIN(SPI_RA_SGPR_SIMD_FULL_CSN) - max: MAX(SPI_RA_SGPR_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient CU LDS: - avg: AVG(SPI_RA_LDS_CU_FULL_CSN) - min: MIN(SPI_RA_LDS_CU_FULL_CSN) - max: MAX(SPI_RA_LDS_CU_FULL_CSN) - unit: CU - tips: - Insufficient CU Barries: - avg: AVG(SPI_RA_BAR_CU_FULL_CSN) - min: MIN(SPI_RA_BAR_CU_FULL_CSN) - max: MAX(SPI_RA_BAR_CU_FULL_CSN) - unit: CU - tips: - Insufficient Bulky Resource: - avg: AVG(SPI_RA_BULKY_CU_FULL_CSN) - min: MIN(SPI_RA_BULKY_CU_FULL_CSN) - max: MAX(SPI_RA_BULKY_CU_FULL_CSN) - unit: CU - tips: - Reach CU Threadgroups Limit: - avg: AVG(SPI_RA_TGLIM_CU_FULL_CSN) - min: MIN(SPI_RA_TGLIM_CU_FULL_CSN) - max: MAX(SPI_RA_TGLIM_CU_FULL_CSN) - unit: Cycles - tips: - Reach CU Wave Limit: - avg: 
AVG(SPI_RA_WVLIM_STALL_CSN) - min: MIN(SPI_RA_WVLIM_STALL_CSN) - max: MAX(SPI_RA_WVLIM_STALL_CSN) - unit: Cycles + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: - VGPR Writes: - avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: diff --git a/src/omniperf_analyze/configs/gfx908/0700_wavefront-launch.yaml b/src/omniperf_analyze/configs/gfx908/0700_wavefront-launch.yaml index 70141193e..abcaae418 100644 --- a/src/omniperf_analyze/configs/gfx908/0700_wavefront-launch.yaml +++ b/src/omniperf_analyze/configs/gfx908/0700_wavefront-launch.yaml @@ -77,7 +77,7 @@ Panel Config: avg: AVG(scr) min: MIN(scr) max: MAX(scr) - unit: Bytes + unit: Bytes/Workitem tips: - metric_table: @@ -103,7 +103,7 @@ Panel Config: max: MAX(GRBM_GUI_ACTIVE) unit: Cycle tips: - Instr/wavefront: + Instructions per wavefront: avg: AVG((SQ_INSTS / SQ_WAVES)) min: MIN((SQ_INSTS / SQ_WAVES)) max: MAX((SQ_INSTS / SQ_WAVES)) diff --git a/src/omniperf_analyze/configs/gfx908/1000_compute-unit-instruction-mix.yaml b/src/omniperf_analyze/configs/gfx908/1000_compute-unit-instruction-mix.yaml index 9df6750f6..9aac87117 100644 --- a/src/omniperf_analyze/configs/gfx908/1000_compute-unit-instruction-mix.yaml +++ b/src/omniperf_analyze/configs/gfx908/1000_compute-unit-instruction-mix.yaml @@ -10,7 +10,7 @@ Panel Config: data source: - metric_table: id: 1001 - title: Instruction Mix + title: Overall Instruction Mix header: metric: Metric avg: Avg @@ -22,7 +22,7 @@ Panel Config: type: simple_bar label_txt: (# of instr + $normUnit) metric: - VALU - Vector: + VALU: avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) @@ -40,7 +40,7 @@ Panel Config: max: MAX((SQ_INSTS_LDS / $denom)) unit: (instr + $normUnit) tips: - VALU - MFMA: + MFMA: avg: None # No HW module min: None # No HW module max: None # No HW module @@ -64,12 +64,6 @@ Panel Config: max: MAX((SQ_INSTS_BRANCH / $denom)) unit: (instr + $normUnit) tips: - GDS: - avg: AVG((SQ_INSTS_GDS / $denom)) - min: 
MIN((SQ_INSTS_GDS / $denom)) - max: MAX((SQ_INSTS_GDS / $denom)) - unit: (instr + $normUnit) - tips: - metric_table: id: 1002 @@ -103,7 +97,7 @@ Panel Config: max: None # No perf counter unit: (instr + $normUnit) tips: - F16-Mult: + F16-MUL: avg: None # No perf counter min: None # No perf counter max: None # No perf counter @@ -127,7 +121,7 @@ Panel Config: max: None # No perf counter unit: (instr + $normUnit) tips: - F32-Mult: + F32-MUL: avg: None # No perf counter min: None # No perf counter max: None # No perf counter @@ -151,7 +145,7 @@ Panel Config: max: None # No perf counter unit: (instr + $normUnit) tips: - F64-Mult: + F64-MUL: avg: None # No perf counter min: None # No perf counter max: None # No perf counter @@ -180,62 +174,65 @@ Panel Config: id: 1003 title: VMEM Instr Mix header: - type: Type - count: Count - tips: Tips - metric: - Buffer Instr: - count: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - tips: - Buffer Read: - count: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - tips: - Buffer Write: - count: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - tips: - Buffer Atomic: - count: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - tips: - Flat Instr: - count: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - tips: - Flat Read: - count: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - tips: - Flat Write: - count: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - tips: - Flat Atomic: - count: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - tips: - - - metric_table: - id: 1004 - title: MFMA Arithmetic Instr Mix - header: - type: Type - count: Count + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit tips: Tips metric: - MFMA-I8: - count: None # No HW module + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) tips: - MFMA-F16: - count: None # No HW module + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) tips: - MFMA-BF16: - count: None # No HW module + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) tips: - MFMA-F32: - count: None # No HW module + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) tips: - MFMA-F64: - count: None # No HW module + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Read: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) tips: - 
metric_table: - id: 1104 - title: Memory Latencies + id: 1004 + title: MFMA Arithmetic Instr Mix header: metric: Metric avg: Avg @@ -244,27 +241,33 @@ Panel Config: unit: Unit tips: Tips metric: - VMEM Latency: - avg: None # No perf counter - min: None # No perf counter - max: None # No perf counter - unit: Cycles - tips: SQ_INSTS_LEVEL_VMEM - SMEM Latency: - avg: None # No perf counter - min: None # No perf counter - max: None # No perf counter - unit: Cycles - tips: SQ_INSTS_LEVEL_SMEM - Instr Fetch Latency: - avg: None # No perf counter - min: None # No perf counter - max: None # No perf counter - unit: Cycles - tips: SQ_IFETCH_LEVEL - LDS Latency: - avg: None # No perf counter - min: None # No perf counter - max: None # No perf counter - unit: Cycles - tips: SQ_INST_LEVEL_LDS + MFMA-I8: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) + tips: + MFMA-F16: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) + tips: + MFMA-BF16: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) + tips: + MFMA-F32: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) + tips: + MFMA-F64: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) + tips: diff --git a/src/omniperf_analyze/configs/gfx908/1100_compute-unit-compute-pipeline.yaml b/src/omniperf_analyze/configs/gfx908/1100_compute-unit-compute-pipeline.yaml index 061311d62..8dfcef927 100644 --- a/src/omniperf_analyze/configs/gfx908/1100_compute-unit-compute-pipeline.yaml +++ b/src/omniperf_analyze/configs/gfx908/1100_compute-unit-compute-pipeline.yaml @@ -13,7 +13,10 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak tips: Tips style: type: simple_bar @@ -21,23 +24,47 @@ Panel Config: label_txt: (%) xrange: [0, 110] metric: - valu_flops_pop: + VALU FLOPs: + value: None # No perf counter + Unit: None + peak: None + pop: None + tips: + VALU IOPs: value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_bf16_pop: + MFMA FLOPs (BF16): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_f16_pop: + MFMA FLOPs (F16): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_f32_pop: + MFMA FLOPs (F32): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_f64_pop: + MFMA FLOPs (F64): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_i8_pop: + MFMA IOPs (INT8): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - metric_table: @@ -51,36 +78,48 @@ Panel Config: unit: Unit tips: Tips metric: - IPC (Avg): + IPC: avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) unit: Instr/cycle tips: - IPC (Issue): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + IPC (Issued): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) - min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + 
SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) unit: Instr/cycle tips: - SALU Util: + SALU Utilization: avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) min: MIN((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) max: MAX((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) unit: pct tips: - VALU Util: + VALU Utilization: avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) min: MIN((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) max: MAX((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) unit: pct tips: + VMEM Utilization: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: pct + tips: + Branch Utilization: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: pct + tips: VALU Active Threads: avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) @@ -90,7 +129,7 @@ Panel Config: != 0) else None)) unit: Threads tips: - MFMA Util: + MFMA Utilization: avg: None # No HW module min: None # No HW module max: None # No HW module @@ -102,6 +141,20 @@ Panel Config: max: None # No HW module unit: cycles/instr tips: + VMEM Latency: + avg: None # No perf counter + min: None # No perf counter + max: None # No perf counter + unit: Cycles + coll_level: SQ_INST_LEVEL_VMEM + tips: + SMEM Latency: + avg: None # No perf counter + min: None # No perf counter + max: None # No perf counter + unit: Cycles + coll_level: SQ_INST_LEVEL_SMEM + tips: - metric_table: id: 1103 @@ -121,7 +174,7 @@ Panel Config: max: None # No perf counter unit: (OPs + $normUnit) tips: - INT8 OPs: + IOPs (Total): avg: None # No perf counter min: None # No perf counter max: None # No perf counter @@ -151,5 +204,11 @@ Panel Config: max: None # No perf counter unit: (OPs + $normUnit) tips: + INT8 OPs: + avg: None # No perf counter + min: None # No perf counter + max: None # No perf counter + unit: (OPs + $normUnit) + tips: diff --git a/src/omniperf_analyze/configs/gfx908/1200_lds.yaml b/src/omniperf_analyze/configs/gfx908/1200_lds.yaml index 3fd52c3b1..1fda7461d 100644 --- a/src/omniperf_analyze/configs/gfx908/1200_lds.yaml +++ b/src/omniperf_analyze/configs/gfx908/1200_lds.yaml @@ -30,11 +30,13 @@ Panel Config: value: AVG(((200 * SQ_ACTIVE_INST_LDS) / (GRBM_GUI_ACTIVE * $numCU))) unit: Pct of Peak tips: - Bandwidth (Pct-of-Peak): + unit: pct + Theoretical Bandwidth: value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs)) / (($sclk * $numCU) * 0.00128))) unit: Pct of Peak tips: + unit: pct Bank Conflict Rate: value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) @@ -58,7 +60,7 @@ Panel Config: max: MAX((SQ_INSTS_LDS / $denom)) unit: (Instr + $normUnit) tips: - Bandwidth: + Theoretical Bandwidth: avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * 
TO_INT($LDSBanks)) / $denom)) min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) @@ -67,7 +69,14 @@ Panel Config: / $denom)) unit: (Bytes + $normUnit) tips: - Bank Conficts/Access: + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + tips: + Bank Conflicts/Access: avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) @@ -82,7 +91,7 @@ Panel Config: max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) unit: (Cycles + $normUnit) tips: - Atomic Cycles: + Atomic Return Cycles: avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) @@ -110,12 +119,5 @@ Panel Config: avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) - unit: ( + $normUnit) - tips: - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_LDS - tips: + unit: (Accesses + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx908/1300_instruction-cache.yaml b/src/omniperf_analyze/configs/gfx908/1300_instruction-cache.yaml index 05dc75980..555bc714a 100644 --- a/src/omniperf_analyze/configs/gfx908/1300_instruction-cache.yaml +++ b/src/omniperf_analyze/configs/gfx908/1300_instruction-cache.yaml @@ -13,7 +13,7 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: @@ -27,11 +27,16 @@ Panel Config: * (EndNs - BeginNs)))) unit: Pct of Peak tips: - Cache Hit: + Cache Hit Rate: value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + SQC_ICACHE_MISSES_DUPLICATE))) unit: Pct of Peak tips: + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($sclk * $numSQC) + * (EndNs - BeginNs)))) + unit: Pct of Peak + tips: - metric_table: id: 1302 @@ -68,7 +73,7 @@ Panel Config: max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) unit: (Misses + $normUnit) tips: - Cache Hit: + Cache Hit Rate: avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + SQC_ICACHE_MISSES_DUPLICATE))) min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + @@ -77,3 +82,27 @@ Panel Config: SQC_ICACHE_MISSES_DUPLICATE))) unit: pct tips: + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + tips: + - metric_table: + id: 1303 + title: Instruction Cache - L2 Interface + header: + metric: Metric + mean: Mean + min: Min + max: Max + unit: Unit + tips: Tips + metric: + L1I-L2 Bandwidth: + mean: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) + tips: \ No newline at end of file diff --git 
a/src/omniperf_analyze/configs/gfx908/1400_constant-cache.yaml b/src/omniperf_analyze/configs/gfx908/1400_constant-cache.yaml index 563caad13..aa55fee0c 100644 --- a/src/omniperf_analyze/configs/gfx908/1400_constant-cache.yaml +++ b/src/omniperf_analyze/configs/gfx908/1400_constant-cache.yaml @@ -12,8 +12,8 @@ Panel Config: id: 1401 title: Speed-of-Light header: - mertic: Metric - value: Value + metric: Metric + value: Avg unit: Unit tips: Tips style: @@ -27,12 +27,17 @@ Panel Config: * (EndNs - BeginNs)))) unit: Pct of Peak tips: - Cache Hit: + Cache Hit Rate: value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: Pct of Peak tips: + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) + / (2 * ($sclk * $numSQC) * (EndNs - BeginNs))) + unit: Pct of Peak + tips: - metric_table: id: 1402 @@ -69,7 +74,7 @@ Panel Config: max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) unit: (Req + $normUnit) tips: - Cache Hit: + Cache Hit Rate: avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) @@ -138,6 +143,12 @@ Panel Config: unit: Unit tips: Tips metric: + sL1D-L2 BW: + mean: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + unit: (Bytes + $normUnit) + tips: Read Req: avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) min: MIN((SQC_TC_DATA_READ_REQ / $denom)) @@ -156,7 +167,7 @@ Panel Config: max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) unit: (Req + $normUnit) tips: - Stall: + Stall Cycles: avg: AVG((SQC_TC_STALL / $denom)) min: MIN((SQC_TC_STALL / $denom)) max: MAX((SQC_TC_STALL / $denom)) diff --git a/src/omniperf_analyze/configs/gfx908/1500_TA_and_TD.yaml b/src/omniperf_analyze/configs/gfx908/1500_TA_and_TD.yaml index 8f71cedc9..773bb7c76 100644 --- a/src/omniperf_analyze/configs/gfx908/1500_TA_and_TD.yaml +++ b/src/omniperf_analyze/configs/gfx908/1500_TA_and_TD.yaml @@ -6,11 +6,11 @@ Metric Description: # Define the panel properties and properties of each metric in the panel. 
Panel Config: id: 1500 - title: Texture Addresser and Texture Data (TA/TD) + title: Address Processing Unit and Data Return Path (TA/TD) data source: - metric_table: id: 1501 - title: TA + title: Address Processing Unit header: metric: Metric avg: Avg @@ -19,25 +19,25 @@ Panel Config: unit: Unit tips: Tips metric: - TA Busy: + Address Processing Unit Busy: avg: AVG(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TA Addr Stall: + Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TA Data Stall: + Data Stall: avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TD2TA Addr Stall: + Data-Processor → Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) @@ -47,69 +47,69 @@ Panel Config: avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Instr: + Global/Generic Instructions: avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Read Instr: + Global/Generic Read Instructions: avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Write Instr: + Global/Generic Write Instructions: avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Atomic Instr: + Global/Generic Atomic Instructions: avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Instr: + Spill/Stack Instructions: avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Read Instr: + Spill/Stack Read Instructions: avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Write Instr: + Spill/Stack Write Instructions: avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: 
(Instructions + $normUnit) tips: - Buffer Atomic Instr: + Spill/Stack Atomic Instructions: avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Total Cylces: + Spill/Stack Total Cycles: avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) tips: - Buffer Coalesced Read: + Spill/Stack Coalesced Read: avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) tips: - Buffer Coalesced Write: + Spill/Stack Coalesced Write: avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) @@ -118,7 +118,7 @@ Panel Config: - metric_table: id: 1502 - title: TD + title: Data-Return Path header: metric: Metric avg: Avg @@ -127,48 +127,48 @@ Panel Config: unit: Unit tips: Tips metric: - TD Busy: + Data-Return Busy: avg: AVG(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TD Stall: + Cache RAM → Data-Return Stall: avg: AVG(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - SPI2TD Stall: + Workgroup manager → Data-Return Stall: avg: # No perf counter min: # No perf counter max: # No perf counter unit: pct tips: - Coalescable Instr: + Coalescable Instructions: avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Load Instr: + Read Instructions: avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Store Instr: + Write Instructions: avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Atomic Instr: + Atomic Instructions: avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: diff --git a/src/omniperf_analyze/configs/gfx908/1600_L1_cache.yaml b/src/omniperf_analyze/configs/gfx908/1600_L1_cache.yaml index cac92b1f2..db6b688ab 100644 --- a/src/omniperf_analyze/configs/gfx908/1600_L1_cache.yaml +++ b/src/omniperf_analyze/configs/gfx908/1600_L1_cache.yaml @@ -13,7 +13,7 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: @@ -22,26 +22,26 @@ Panel Config: label_txt: (%) xrange: [0, 110] metric: - Buffer Coalescing: - value: 
AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - tips: - Cache Util: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) unit: Pct of Peak tips: - Cache BW: + Bandwidth: value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs)))) / ((($sclk / 1000) * 64) * $numCU)) unit: Pct of Peak tips: - Cache Hit: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + tips: + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) unit: Pct of Peak tips: @@ -141,11 +141,26 @@ Panel Config: unit: (Req + $normUnit) tips: Cache BW: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - unit: GB/s + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + unit: (Bytes + $normUnit) tips: + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + unit: pct + tips: Cache Accesses: avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) @@ -164,22 +179,7 @@ Panel Config: / $denom)) unit: (Req + $normUnit) tips: - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: pct - tips: - Invalidate: + Invalidations: avg: 
AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) @@ -188,9 +188,9 @@ Panel Config: L1-L2 BW: avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - min: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - max: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) unit: (Bytes + $normUnit) tips: @@ -388,17 +388,17 @@ Panel Config: avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Hits + $normUnit) + units: (Req + $normUnit) tips: - Misses (Translation): + Translation Misses: avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Misses + $normUnit) + units: (Req + $normUnit) tips: - Misses (Permission): + Permission Misses: avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Misses + $normUnit) + units: (Req + $normUnit) tips: diff --git a/src/omniperf_analyze/configs/gfx908/1700_L2_cache.yaml b/src/omniperf_analyze/configs/gfx908/1700_L2_cache.yaml index 0c7b03811..cf782e193 100644 --- a/src/omniperf_analyze/configs/gfx908/1700_L2_cache.yaml +++ b/src/omniperf_analyze/configs/gfx908/1700_L2_cache.yaml @@ -13,31 +13,35 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: type: simple_bar metric: - L2 Util: + Utilization: value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($L2Banks) * GRBM_GUI_ACTIVE))) unit: pct + tips: + Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 64) / (EndNs - BeginNs)))) / ((($sclk / 1000) * 64) * TO_INT($L2Banks))) + unit: pct tips: - Cache Hit: + Hit Rate: value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)) unit: pct - tips: - L2-EA Rd BW: + tips: + L2-Fabric Read BW: value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / (EndNs - BeginNs))) unit: GB/s - tips: - L2-EA Wr BW: + tips: + L2-Fabric Write and Atomic BW: value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / (EndNs - BeginNs))) unit: GB/s - tips: + tips: - metric_table: id: 1702 @@ -50,7 +54,7 @@ Panel Config: unit: Unit tips: Tips metric: - Read BW: + L2-Fabric Read BW: avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / $denom)) min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) @@ -58,8 +62,26 @@ Panel Config: max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / $denom)) unit: (Bytes + $normUnit) - tips: - Write BW: + tips: + HBM Read Traffic: + avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * 
(TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + Remote Read Traffic: + avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + L2-Fabric Write and Atomic BW: avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / $denom)) min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) @@ -67,55 +89,31 @@ Panel Config: max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / $denom)) unit: (Bytes + $normUnit) - tips: - Read (32B): - avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (Uncached 32B): - avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (64B): - avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - HBM Read: - avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write (32B): - avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - Write (Uncached 32B): - avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write (64B): - avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Write: - avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: + tips: + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Remote Write and Atomic Traffic: + avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_WRREQ_sum - 
TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Atomic Traffic: + avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: Read Latency: avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) @@ -124,7 +122,7 @@ Panel Config: max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) unit: Cycles - tips: + tips: Write Latency: avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) @@ -133,7 +131,7 @@ Panel Config: max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) unit: Cycles - tips: + tips: Atomic Latency: avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum != 0) else None)) @@ -142,7 +140,7 @@ Panel Config: max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum != 0) else None)) unit: Cycles - tips: + tips: Read Stall: avg: AVG((((100 * ((TCC_EA_RDREQ_IO_CREDIT_STALL_sum + TCC_EA_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != @@ -154,7 +152,7 @@ Panel Config: + TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) unit: pct - tips: + tips: Write Stall: avg: AVG((((100 * ((TCC_EA_WRREQ_IO_CREDIT_STALL_sum + TCC_EA_WRREQ_GMI_CREDIT_STALL_sum) + TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != @@ -166,7 +164,7 @@ Panel Config: + TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) unit: pct - tips: + tips: - metric_table: id: 1703 @@ -179,121 +177,127 @@ Panel Config: unit: Unit tips: Tips metric: + Bandwidth: + avg: AVG((TCC_REQ_sum * 64) / $denom) + min: MIN((TCC_REQ_sum * 64) / $denom) + max: MAX((TCC_REQ_sum * 64) / $denom) + unit: (Bytes + $normUnit) + tips: Req: avg: AVG((TCC_REQ_sum / $denom)) min: MIN((TCC_REQ_sum / $denom)) max: MAX((TCC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: + tips: Read Req: avg: AVG((TCC_READ_sum / $denom)) min: MIN((TCC_READ_sum / $denom)) max: MAX((TCC_READ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: Write Req: avg: AVG((TCC_WRITE_sum / $denom)) min: MIN((TCC_WRITE_sum / $denom)) max: MAX((TCC_WRITE_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: Atomic Req: avg: AVG((TCC_ATOMIC_sum / $denom)) min: MIN((TCC_ATOMIC_sum / $denom)) max: MAX((TCC_ATOMIC_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: Probe Req: avg: AVG((TCC_PROBE_sum / $denom)) min: MIN((TCC_PROBE_sum / $denom)) max: MAX((TCC_PROBE_sum / $denom)) 
unit: (Req + $normUnit) - tips: + tips: + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + tips: Hits: avg: AVG((TCC_HIT_sum / $denom)) min: MIN((TCC_HIT_sum / $denom)) max: MAX((TCC_HIT_sum / $denom)) unit: (Hits + $normUnit) - tips: + tips: Misses: avg: AVG((TCC_MISS_sum / $denom)) min: MIN((TCC_MISS_sum / $denom)) max: MAX((TCC_MISS_sum / $denom)) unit: (Misses + $normUnit) - tips: - Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - tips: + tips: Writeback: avg: AVG((TCC_WRITEBACK_sum / $denom)) min: MIN((TCC_WRITEBACK_sum / $denom)) max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: + unit: (Cachelines + $normUnit) + tips: + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Evict (Normal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: NC Req: avg: AVG((TCC_NC_REQ_sum / $denom)) min: MIN((TCC_NC_REQ_sum / $denom)) max: MAX((TCC_NC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: UC Req: avg: AVG((TCC_UC_REQ_sum / $denom)) min: MIN((TCC_UC_REQ_sum / $denom)) max: MAX((TCC_UC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: CC Req: avg: AVG((TCC_CC_REQ_sum / $denom)) min: MIN((TCC_CC_REQ_sum / $denom)) max: MAX((TCC_CC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: RW Req: avg: AVG((TCC_RW_REQ_sum / $denom)) min: MIN((TCC_RW_REQ_sum / $denom)) max: MAX((TCC_RW_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: - Writeback (Normal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: - Writeback (TC Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: - Evict (Normal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: ( + $normUnit) - tips: - 
Evict (TC Req): - avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: ( + $normUnit) - tips: + tips: - metric_table: id: 1704 - title: L2 - EA Interface Stalls + title: L2 - Fabric Interface Stalls header: metric: Metric type: Type @@ -306,59 +310,137 @@ Panel Config: style: type: simple_multi_bar metric: - Read - Remote Socket Stall: - type: Remote Socket Stall + Read - PCIe Stall: + type: PCIe Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read - Peer GCD Stall: - type: Peer GCD Stall + avg: AVG(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Read - Infinity Fabric™ Stall: + type: Infinity Fabric™ Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Read - HBM Stall: type: HBM Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write - Remote Socket Stall: - type: Remote Socket Stall + avg: AVG(((100 * (TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Write - PCIe Stall: + type: PCIe Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write - Peer GCD Stall: - type: Peer GCD Stall + avg: AVG(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Write - Infinity Fabric™ Stall: + type: Infinity Fabric™ Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: 
MAX(((100 * (TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Write - HBM Stall: - type: HBM Stall + type: HBM Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Write - Credit Starvation: type: Credit Starvation transaction: Write - avg: AVG((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) - min: MIN((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) - max: MAX((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + + - metric_table: + id: 1705 + title: L2 - Fabric Detailed Transaction Breakdown + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Read (32B): + avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: + Read (Uncached): + avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Read (64B): + avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + unit: (Req + $normUnit) + tips: + HBM Read: + avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + tips: + Remote Read: + avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (32B): + avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (Uncached): + avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (64B): + avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + tips: + HBM Write and Atomic: + avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + tips: + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + min: 
MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + tips: + Atomic: + avg: AVG((TCC_EA_ATOMIC_sum / $denom)) + min: MIN((TCC_EA_ATOMIC_sum / $denom)) + max: MAX((TCC_EA_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx908/1800_L2_cache_per_channel.yaml b/src/omniperf_analyze/configs/gfx908/1800_L2_cache_per_channel.yaml index 45f8abb41..54bf67dfc 100644 --- a/src/omniperf_analyze/configs/gfx908/1800_L2_cache_per_channel.yaml +++ b/src/omniperf_analyze/configs/gfx908/1800_L2_cache_per_channel.yaml @@ -167,7 +167,7 @@ Panel Config: + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) unit: pct tips: - Req: + L2 Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_REQ[0]) + TO_INT(TCC_REQ[1])) + TO_INT(TCC_REQ[2])) + TO_INT(TCC_REQ[3])) + TO_INT(TCC_REQ[4])) + TO_INT(TCC_REQ[5])) + TO_INT(TCC_REQ[6])) + TO_INT(TCC_REQ[7])) + TO_INT(TCC_REQ[8])) + TO_INT(TCC_REQ[9])) @@ -206,7 +206,7 @@ Panel Config: + TO_INT(TCC_REQ[30])) + TO_INT(TCC_REQ[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Read Req: + L2 Read Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_READ[0]) + TO_INT(TCC_READ[1])) + TO_INT(TCC_READ[2])) + TO_INT(TCC_READ[3])) + TO_INT(TCC_READ[4])) + TO_INT(TCC_READ[5])) + TO_INT(TCC_READ[6])) + TO_INT(TCC_READ[7])) + TO_INT(TCC_READ[8])) + TO_INT(TCC_READ[9])) @@ -249,7 +249,7 @@ Panel Config: + TO_INT(TCC_READ[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Write Req: + L2 Write Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_WRITE[0]) + TO_INT(TCC_WRITE[1])) + TO_INT(TCC_WRITE[2])) + TO_INT(TCC_WRITE[3])) + TO_INT(TCC_WRITE[4])) + TO_INT(TCC_WRITE[5])) + TO_INT(TCC_WRITE[6])) + TO_INT(TCC_WRITE[7])) + TO_INT(TCC_WRITE[8])) @@ -296,7 +296,7 @@ Panel Config: + TO_INT(TCC_WRITE[30])) + TO_INT(TCC_WRITE[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Atomic Req: + L2 Atomic Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_ATOMIC[0]) + TO_INT(TCC_ATOMIC[1])) + TO_INT(TCC_ATOMIC[2])) + TO_INT(TCC_ATOMIC[3])) + TO_INT(TCC_ATOMIC[4])) + TO_INT(TCC_ATOMIC[5])) + TO_INT(TCC_ATOMIC[6])) + TO_INT(TCC_ATOMIC[7])) @@ -347,7 +347,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Read Req: + L2 - Fabric Read Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ[0]) + TO_INT(TCC_EA_RDREQ[1])) + TO_INT(TCC_EA_RDREQ[2])) + TO_INT(TCC_EA_RDREQ[3])) + TO_INT(TCC_EA_RDREQ[4])) + TO_INT(TCC_EA_RDREQ[5])) + TO_INT(TCC_EA_RDREQ[6])) + TO_INT(TCC_EA_RDREQ[7])) @@ -398,7 +398,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Write Req: + L2 - Fabric Write and Atomic Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ[0]) + TO_INT(TCC_EA_WRREQ[1])) + TO_INT(TCC_EA_WRREQ[2])) + TO_INT(TCC_EA_WRREQ[3])) + TO_INT(TCC_EA_WRREQ[4])) + TO_INT(TCC_EA_WRREQ[5])) + TO_INT(TCC_EA_WRREQ[6])) + TO_INT(TCC_EA_WRREQ[7])) @@ -449,7 +449,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Atomic Req: + L2 - Fabric Atomic Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_ATOMIC[0]) + TO_INT(TCC_EA_ATOMIC[1])) + TO_INT(TCC_EA_ATOMIC[2])) + TO_INT(TCC_EA_ATOMIC[3])) + TO_INT(TCC_EA_ATOMIC[4])) + TO_INT(TCC_EA_ATOMIC[5])) + TO_INT(TCC_EA_ATOMIC[6])) + TO_INT(TCC_EA_ATOMIC[7])) @@ -500,7 +500,7 @@ Panel Config: / 32) / $denom)) unit: (Req 
+ $normUnit) tips: - L2 - EA Read Lat: + L2 - Fabric Read Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_RDREQ_LEVEL[0] + TCC_EA_RDREQ_LEVEL[1]) + TCC_EA_RDREQ_LEVEL[2]) + TCC_EA_RDREQ_LEVEL[3]) + TCC_EA_RDREQ_LEVEL[4]) + TCC_EA_RDREQ_LEVEL[5]) + TCC_EA_RDREQ_LEVEL[6]) + TCC_EA_RDREQ_LEVEL[7]) @@ -615,7 +615,7 @@ Panel Config: + TCC_EA_RDREQ[29]) + TCC_EA_RDREQ[30]) + TCC_EA_RDREQ[31]) != 0) else None)) unit: Cycles tips: - L2 - EA Write Lat: + L2 - Fabric Write Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_WRREQ_LEVEL[0] + TCC_EA_WRREQ_LEVEL[1]) + TCC_EA_WRREQ_LEVEL[2]) + TCC_EA_WRREQ_LEVEL[3]) + TCC_EA_WRREQ_LEVEL[4]) + TCC_EA_WRREQ_LEVEL[5]) + TCC_EA_WRREQ_LEVEL[6]) + TCC_EA_WRREQ_LEVEL[7]) @@ -730,7 +730,7 @@ Panel Config: + TCC_EA_WRREQ[29]) + TCC_EA_WRREQ[30]) + TCC_EA_WRREQ[31]) != 0) else None)) unit: Cycles tips: - L2 - EA Atomic Lat: + L2 - Fabric Atomic Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_ATOMIC_LEVEL[0] + TCC_EA_ATOMIC_LEVEL[1]) + TCC_EA_ATOMIC_LEVEL[2]) + TCC_EA_ATOMIC_LEVEL[3]) + TCC_EA_ATOMIC_LEVEL[4]) + TCC_EA_ATOMIC_LEVEL[5]) + TCC_EA_ATOMIC_LEVEL[6]) + TCC_EA_ATOMIC_LEVEL[7]) @@ -849,7 +849,7 @@ Panel Config: None)) unit: Cycles tips: - L2 - EA Read Stall (IO): + L2 - Fabric Read Stall (PCIe): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[0]) + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[1])) + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[2])) + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[3])) + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[4])) @@ -920,7 +920,7 @@ Panel Config: + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Read Stall (GMI): + L2 - Fabric Read Stall (Infinity Fabric™): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[0]) + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[1])) + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[2])) + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[3])) + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[4])) @@ -991,7 +991,7 @@ Panel Config: + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Read Stall (DRAM): + L2 - Fabric Read Stall (HBM): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[0]) + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[1])) + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[2])) + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[3])) + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[4])) @@ -1062,7 +1062,7 @@ Panel Config: + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (IO): + L2 - Fabric Write Stall (PCIe): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[0]) + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[1])) + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[2])) + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[3])) + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[4])) @@ -1133,7 +1133,7 @@ Panel Config: + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (GMI): + L2 - Fabric Write Stall (Infinity Fabric™): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[0]) + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[1])) + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[2])) + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[3])) + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[4])) @@ -1204,7 +1204,7 @@ Panel Config: + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (DRAM): + L2 - Fabric Write Stall (HBM): avg: 
AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[0]) + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[1])) + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[2])) + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[3])) + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[4])) @@ -1275,7 +1275,7 @@ Panel Config: + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Write Starve: + L2 - Fabric Write Starve: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[0]) + TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[1])) + TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[2])) + TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[3])) + TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[4])) @@ -1354,22 +1354,22 @@ Panel Config: channel: Channel hit rate: L2 Cache Hit Rate (%) req: Requests (Requests) - read req: L1-L2 Read (Requests) - write req: L1-L2 Write (Requests) - atomic req: L1-L2 Atomic (Requests) - ea read req: L2-EA Read (Requests) - ea write req: L2-EA Write (Requests) - ea atomic req: L2-EA Atomic (Requests) - ea read lat - cycles: L2-EA Read Latency (Cycles) - ea write lat - cycles: L2-EA Write Latency (Cycles) - ea atomic lat - cycles: L2-EA Atomic Latency (Cycles) - ea read stall - io: L2-EA Read Stall - IO (Cycles per) - ea read stall - gmi: L2-EA Read Stall - GMI (Cycles per) - ea read stall - dram: L2-EA Read Stall - DRAM (Cycles per) - ea write stall - io: L2-EA Write Stall - IO (Cycles per) - ea write stall - gmi: L2-EA Write Stall - GMI (Cycles per) - ea write stall - dram: L2-EA Write Stall - DRAM (Cycles per) - ea write stall - starve: L2-EA Write Stall - Starve (Cycles per) + read req: L2 Read (Requests) + write req: L2 Write (Requests) + atomic req: L2 Atomic (Requests) + ea read req: L2-Fabric Read (Requests) + ea write req: L2-Fabric Write and Atomic (Requests) + ea atomic req: L2-Fabric Atomic (Requests) + ea read lat - cycles: L2-Fabric Read Latency (Cycles) + ea write lat - cycles: L2-Fabric Write Latency (Cycles) + ea atomic lat - cycles: L2-Fabric Atomic Latency (Cycles) + ea read stall - io: L2-Fabric Read Stall - PCIe (Cycles) + ea read stall - gmi: L2-Fabric Read Stall - Infinity Fabric™ (Cycles) + ea read stall - dram: L2-Fabric Read Stall - HBM (Cycles) + ea write stall - io: L2-Fabric Write Stall - PCIe (Cycles) + ea write stall - gmi: L2-Fabric Write Stall - Infinity Fabric™ (Cycles) + ea write stall - dram: L2-Fabric Write Stall - HBM (Cycles) + ea write stall - starve: L2-Fabric Write Stall - Starve (Cycles) tips: Tips metric: "0": @@ -1829,22 +1829,22 @@ Panel Config: channel: Channel hit rate: L2 Cache Hit Rate (%) req: Requests (Requests) - read req: L1-L2 Read (Requests) - write req: L1-L2 Write (Requests) - atomic req: L1-L2 Atomic (Requests) - ea read req: L2-EA Read (Requests) - ea write req: L2-EA Write (Requests) - ea atomic req: L2-EA Atomic (Requests) - ea read lat - cycles: L2-EA Read Latency (Cycles) - ea write lat - cycles: L2-EA Write Latency (Cycles) - ea atomic lat - cycles: L2-EA Atomic Latency (Cycles) - ea read stall - io: L2-EA Read Stall - IO (Cycles per) - ea read stall - gmi: L2-EA Read Stall - GMI (Cycles per) - ea read stall - dram: L2-EA Read Stall - DRAM (Cycles per) - ea write stall - io: L2-EA Write Stall - IO (Cycles per) - ea write stall - gmi: L2-EA Write Stall - GMI (Cycles per) - ea write stall - dram: L2-EA Write Stall - DRAM (Cycles per) - ea write stall - starve: L2-EA Write Stall - Starve (Cycles per) + read req: L2 Read (Requests) + write req: L2 Write (Requests) + atomic req: L2 Atomic (Requests) + ea read req: 
L2-Fabric Read (Requests) + ea write req: L2-Fabric Write and Atomic (Requests) + ea atomic req: L2-Fabric Atomic (Requests) + ea read lat - cycles: L2-Fabric Read Latency (Cycles) + ea write lat - cycles: L2-Fabric Write Latency (Cycles) + ea atomic lat - cycles: L2-Fabric Atomic Latency (Cycles) + ea read stall - io: L2-Fabric Read Stall - PCIe (Cycles) + ea read stall - gmi: L2-Fabric Read Stall - Infinity Fabric™ (Cycles) + ea read stall - dram: L2-Fabric Read Stall - HBM (Cycles) + ea write stall - io: L2-Fabric Write Stall - PCIe (Cycles) + ea write stall - gmi: L2-Fabric Write Stall - Infinity Fabric™ (Cycles) + ea write stall - dram: L2-Fabric Write Stall - HBM (Cycles) + ea write stall - starve: L2-Fabric Write Stall - Starve (Cycles) tips: Tips metric: "16": diff --git a/src/omniperf_analyze/configs/gfx90a/0200_system-speed-of-light.yaml b/src/omniperf_analyze/configs/gfx90a/0200_system-speed-of-light.yaml index c197c0fc5..4f27676a2 100644 --- a/src/omniperf_analyze/configs/gfx90a/0200_system-speed-of-light.yaml +++ b/src/omniperf_analyze/configs/gfx90a/0200_system-speed-of-light.yaml @@ -14,10 +14,10 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit peak: Peak - pop: PoP + pop: Pct of Peak tips: Tips metric: VALU FLOPs: @@ -83,19 +83,19 @@ Panel Config: peak: $numCU pop: ((100 * $numActiveCUs) / $numCU) tips: - SALU Util: + SALU Utilization: value: AVG(((100 * SQ_ACTIVE_INST_SCA) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct peak: 100 pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / (GRBM_GUI_ACTIVE * $numCU))) tips: - VALU Util: + VALU Utilization: value: AVG(((100 * SQ_ACTIVE_INST_VALU) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct peak: 100 pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / (GRBM_GUI_ACTIVE * $numCU))) tips: - MFMA Util: + MFMA Utilization: value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((GRBM_GUI_ACTIVE * $numCU) * 4))) unit: pct @@ -103,7 +103,20 @@ Panel Config: pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((GRBM_GUI_ACTIVE * $numCU) * 4))) tips: - VALU Active Threads/Wave: + VMEM Utilization: + value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / GRBM_GUI_ACTIVE) / $numCU)) + unit: pct + peak: 100 + pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / GRBM_GUI_ACTIVE) / $numCU)) + tips: + Branch Utilization: + value: AVG((((100 * SQ_ACTIVE_INST_MISC) / GRBM_GUI_ACTIVE) / $numCU)) + unit: pct + peak: 100 + pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / GRBM_GUI_ACTIVE) / $numCU)) + unit: pct + tips: + VALU Active Threads: value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) unit: Threads @@ -111,25 +124,29 @@ Panel Config: pop: (AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) * 1.5625) tips: - IPC - Issue: - value: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) - / SQ_ACTIVE_INST_ANY)) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) unit: Instr/cycle peak: 5 - pop: ((100 * AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) - / SQ_ACTIVE_INST_ANY))) / 5) + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) + tips: + Wavefront Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE)) + unit: Wavefronts + peak: ($maxWavesPerCU * $numCU) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE) / 
($maxWavesPerCU + * $numCU)))) + coll_level: SQ_LEVEL_WAVES tips: - LDS BW: + Theoretical LDS Bandwidth: value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs))) - unit: GB/sec + unit: GB/s peak: (($sclk * $numCU) * 0.128) pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs)) / (($sclk * $numCU) * 0.00128))) tips: - LDS Bank Conflict: + LDS Bank Conflicts/Access: value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) unit: Conflicts/access @@ -137,35 +154,7 @@ Panel Config: pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) tips: - Instr Cache Hit Rate: - value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - tips: - Instr Cache BW: - value: AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64)) - unit: GB/s - peak: ((($sclk / 1000) * 64) * $numSQC) - pop: ((100 * AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk - / 1000) * 64) * $numSQC)) - tips: - Scalar L1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - tips: - Scalar L1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64)) - unit: GB/s - peak: ((($sclk / 1000) * 64) * $numSQC) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk - / 1000) * 64) * $numSQC)) - tips: - Vector L1D Cache Hit Rate: + vL1D Cache Hit Rate: value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else @@ -177,7 +166,7 @@ Panel Config: TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else None)) tips: - Vector L1D Cache BW: + vL1D Cache BW: value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) unit: GB/s peak: ((($sclk / 1000) * 64) * $numCU) @@ -192,6 +181,13 @@ Panel Config: pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else None)) tips: + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 128) / (EndNs - BeginNs))) + unit: GB/s + peak: ((($sclk / 1000) * 128) * TO_INT($L2Banks)) + pop: ((100 * AVG(((TCC_REQ_sum * 128) / (EndNs - BeginNs)))) + / ((($sclk / 1000) * 128) * TO_INT($L2Banks))) + tips: L2-Fabric Read BW: value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / (EndNs - BeginNs))) @@ -212,36 +208,48 @@ Panel Config: value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None tips: L2-Fabric Write Latency: value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None tips: - Wave Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE)) - unit: Wavefronts - peak: ($maxWavesPerCU * $numCU) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE) / 
($maxWavesPerCU - * $numCU)))) - coll_level: SQ_LEVEL_WAVES + sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) tips: - Instr Fetch BW: - value: AVG(((SQ_IFETCH / (EndNs - BeginNs)) * 32)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64)) unit: GB/s - peak: ((($sclk / 1000) * 32) * $numSQC) - pop: ((100 * AVG(((SQ_IFETCH / (EndNs - BeginNs)) * 32))) / ($numSQC - * (($sclk / 1000) * 32))) - coll_level: SQ_IFETCH_LEVEL + peak: ((($sclk / 1000) * 64) * $numSQC) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk + / 1000) * 64) * $numSQC)) + tips: + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + tips: + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64)) + unit: GB/s + peak: ((($sclk / 1000) * 64) * $numSQC) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk + / 1000) * 64) * $numSQC)) tips: - Instr Fetch Latency: + L1I Fetch Latency: value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None coll_level: SQ_IFETCH_LEVEL tips: diff --git a/src/omniperf_analyze/configs/gfx90a/0500_command-processor.yaml b/src/omniperf_analyze/configs/gfx90a/0500_command-processor.yaml index d954f6162..b4a1f0b10 100644 --- a/src/omniperf_analyze/configs/gfx90a/0500_command-processor.yaml +++ b/src/omniperf_analyze/configs/gfx90a/0500_command-processor.yaml @@ -19,19 +19,7 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy Cycles: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles/Kernel - tips: - CPF Busy: - avg: AVG(CPF_CPF_STAT_BUSY) - min: MIN(CPF_CPF_STAT_BUSY) - max: MAX(CPF_CPF_STAT_BUSY) - unit: Cycles/Kernel - tips: - CPF Util: + CPF Utilization: avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) @@ -47,15 +35,9 @@ Panel Config: != 0) else None)) max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY != 0) else None)) - unit: Cycles/Kernel - tips: - L2Cache Intf Busy: - avg: AVG(CPF_CPF_TCIU_BUSY) - min: MIN(CPF_CPF_TCIU_BUSY) - max: MAX(CPF_CPF_TCIU_BUSY) - unit: Cycles/Kernel + unit: pct tips: - L2Cache Intf Util: + CPF-L2 Utilization: avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) @@ -64,7 +46,7 @@ Panel Config: if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) unit: pct tips: - L2Cache Intf Stall: + CPF-L2 Stall: avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY != 0) else None)) min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY @@ -73,16 +55,19 @@ Panel Config: != 0) else None)) unit: pct tips: - UTCL1 Stall: - avg: AVG(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - min: MIN(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - max: MAX(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - unit: 
Cycles/Kernel - tips: + CPF-UTCL1 Stall: + avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + unit: pct + tips: - metric_table: id: 502 - title: Command Processor Compute + title: Packet Processor header: metric: Metric avg: Avg @@ -91,19 +76,7 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy Cycles: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles - tips: - CPC Busy Cycles: - avg: AVG(CPC_CPC_STAT_BUSY) - min: MIN(CPC_CPC_STAT_BUSY) - max: MAX(CPC_CPC_STAT_BUSY) - unit: Cycles - tips: - CPC Util: + CPC Utilization: avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) @@ -112,12 +85,6 @@ Panel Config: if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) unit: pct tips: - CPC Stall Cycles: - avg: AVG(CPC_CPC_STAT_STALL) - min: MIN(CPC_CPC_STAT_STALL) - max: MAX(CPC_CPC_STAT_STALL) - unit: Cycles - tips: CPC Stall Rate: avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY != 0) else None)) @@ -127,28 +94,19 @@ Panel Config: != 0) else None)) unit: pct tips: - CPC Packet Decoding: - avg: AVG(CPC_ME1_BUSY_FOR_PACKET_DECODE) - min: MIN(CPC_ME1_BUSY_FOR_PACKET_DECODE) - max: MAX(CPC_ME1_BUSY_FOR_PACKET_DECODE) - unit: Cycles - tips: - SPI Intf Busy Cycles: - avg: AVG(CPC_ME1_DC0_SPI_BUSY) - min: MIN(CPC_ME1_DC0_SPI_BUSY) - max: MAX(CPC_ME1_DC0_SPI_BUSY) - unit: Cycles - tips: - SPI Intf Util: - avg: AVG((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) unit: pct tips: - L2Cache Intf Util: + CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + unit: Pct + tips: + CPC-L2 Utilization: avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) @@ -157,19 +115,16 @@ Panel Config: if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) unit: pct tips: - UTCL1 Stall Cycles: - avg: AVG(CPC_UTCL1_STALL_ON_TRANSLATION) - min: MIN(CPC_UTCL1_STALL_ON_TRANSLATION) - max: MAX(CPC_UTCL1_STALL_ON_TRANSLATION) - unit: Cycles - tips: - UTCL2 Intf Busy Cycles: - avg: AVG(CPC_CPC_UTCL2IU_BUSY) - min: 
MIN(CPC_CPC_UTCL2IU_BUSY) - max: MAX(CPC_CPC_UTCL2IU_BUSY) - unit: Cycles + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: pct tips: - UTCL2 Intf Util: + CPC-UTCL2 Utilization: avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) diff --git a/src/omniperf_analyze/configs/gfx90a/0600_shader-processor-input.yaml b/src/omniperf_analyze/configs/gfx90a/0600_shader-processor-input.yaml index bab48700a..24d4036ec 100644 --- a/src/omniperf_analyze/configs/gfx90a/0600_shader-processor-input.yaml +++ b/src/omniperf_analyze/configs/gfx90a/0600_shader-processor-input.yaml @@ -6,11 +6,11 @@ Metric Description: # Define the panel properties and properties of each metric in the panel. Panel Config: id: 600 - title: Shader Processor Input (SPI) + title: Workgroup Manager (SPI) data source: - metric_table: id: 601 - title: SPI Stats + title: Workgroup Manager Utilizations header: metric: Metric avg: Avg @@ -19,29 +19,35 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles - tips: - CS Busy: - avg: AVG(SPI_CSN_BUSY) - min: MIN(SPI_CSN_BUSY) - max: MAX(SPI_CSN_BUSY) - unit: Cycles - tips: - SPI Busy: - avg: AVG(GRBM_SPI_BUSY) - min: MIN(GRBM_SPI_BUSY) - max: MAX(GRBM_SPI_BUSY) - unit: Cycles - tips: - SQ Busy: - avg: AVG(SQ_BUSY_CYCLES) - min: MIN(SQ_BUSY_CYCLES) - max: MAX(SQ_BUSY_CYCLES) - unit: Cycles + Accelerator Utilization: + avg: AVG(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + min: MIN(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + max: MAX(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + unit: Pct + tips: + Scheduler-Pipe Utilization: + avg: AVG(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + min: MIN(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + max: MAX(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + unit: Pct + tips: + Workgroup Manager Utilization: + avg: AVG(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + min: MIN(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + max: MAX(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + unit: Pct + tips: + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + min: MIN(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + max: MAX(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + unit: Pct + tips: + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Dispatched Workgroups: avg: AVG(SPI_CSN_NUM_THREADGROUPS) @@ -55,22 +61,27 @@ Panel Config: max: MAX(SPI_CSN_WAVE) unit: Wavefronts tips: - Wave Alloc Failed: - avg: AVG(SPI_RA_REQ_NO_ALLOC) - min: MIN(SPI_RA_REQ_NO_ALLOC) - max: MAX(SPI_RA_REQ_NO_ALLOC) - unit: Cycles - tips: - Wave Alloc Failed - CS: - avg: AVG(SPI_RA_REQ_NO_ALLOC_CSN) - min: MIN(SPI_RA_REQ_NO_ALLOC_CSN) - max: MAX(SPI_RA_REQ_NO_ALLOC_CSN) - unit: Cycles + VGPR Writes: + avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if 
(SPI_CSN_WAVE != 0) else + None)) + min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave tips: - + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + tips: - metric_table: id: 602 - title: SPI Resource Allocation + title: Workgroup Manager - Resource Allocation header: metric: Metric avg: Avg @@ -79,96 +90,78 @@ Panel Config: unit: Unit tips: Tips metric: - Wave request Failed (CS): - avg: AVG(SPI_RA_REQ_NO_ALLOC_CSN) - min: MIN(SPI_RA_REQ_NO_ALLOC_CSN) - max: MAX(SPI_RA_REQ_NO_ALLOC_CSN) - unit: Cycles - tips: - CS Stall: - avg: AVG(SPI_RA_RES_STALL_CSN) - min: MIN(SPI_RA_RES_STALL_CSN) - max: MAX(SPI_RA_RES_STALL_CSN) - unit: Cycles - tips: - CS Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + unit: Pct + tips: + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + unit: Pct + tips: + Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - unit: pct + unit: Pct tips: - Scratch Stall: - avg: AVG(SPI_RA_TMP_STALL_CSN) - min: MIN(SPI_RA_TMP_STALL_CSN) - max: MAX(SPI_RA_TMP_STALL_CSN) - unit: Cycles + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + unit: Pct tips: Insufficient SIMD Waveslots: - avg: AVG(SPI_RA_WAVE_SIMD_FULL_CSN) - min: MIN(SPI_RA_WAVE_SIMD_FULL_CSN) - max: MAX(SPI_RA_WAVE_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient SIMD VGPRs: - avg: AVG(SPI_RA_VGPR_SIMD_FULL_CSN) - min: MIN(SPI_RA_VGPR_SIMD_FULL_CSN) - max: MAX(SPI_RA_VGPR_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * 
SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient SIMD SGPRs: - avg: AVG(SPI_RA_SGPR_SIMD_FULL_CSN) - min: MIN(SPI_RA_SGPR_SIMD_FULL_CSN) - max: MAX(SPI_RA_SGPR_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient CU LDS: - avg: AVG(SPI_RA_LDS_CU_FULL_CSN) - min: MIN(SPI_RA_LDS_CU_FULL_CSN) - max: MAX(SPI_RA_LDS_CU_FULL_CSN) - unit: CU - tips: - Insufficient CU Barries: - avg: AVG(SPI_RA_BAR_CU_FULL_CSN) - min: MIN(SPI_RA_BAR_CU_FULL_CSN) - max: MAX(SPI_RA_BAR_CU_FULL_CSN) - unit: CU - tips: - Insufficient Bulky Resource: - avg: AVG(SPI_RA_BULKY_CU_FULL_CSN) - min: MIN(SPI_RA_BULKY_CU_FULL_CSN) - max: MAX(SPI_RA_BULKY_CU_FULL_CSN) - unit: CU - tips: - Reach CU Threadgroups Limit: - avg: AVG(SPI_RA_TGLIM_CU_FULL_CSN) - min: MIN(SPI_RA_TGLIM_CU_FULL_CSN) - max: MAX(SPI_RA_TGLIM_CU_FULL_CSN) - unit: Cycles - tips: - Reach CU Wave Limit: - avg: AVG(SPI_RA_WVLIM_STALL_CSN) - min: MIN(SPI_RA_WVLIM_STALL_CSN) - max: MAX(SPI_RA_WVLIM_STALL_CSN) - unit: Cycles + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: - VGPR Writes: - avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: diff --git a/src/omniperf_analyze/configs/gfx90a/0700_wavefront-launch.yaml b/src/omniperf_analyze/configs/gfx90a/0700_wavefront-launch.yaml index 13ba5b8e1..5ab83270f 100644 --- a/src/omniperf_analyze/configs/gfx90a/0700_wavefront-launch.yaml +++ b/src/omniperf_analyze/configs/gfx90a/0700_wavefront-launch.yaml @@ -77,7 +77,7 @@ Panel Config: avg: AVG(scr) min: MIN(scr) max: MAX(scr) - unit: Bytes + unit: Bytes/Workitem tips: - metric_table: @@ -103,7 +103,7 @@ Panel Config: max: MAX(GRBM_GUI_ACTIVE) unit: Cycle tips: - Instr/wavefront: + Instructions per wavefront: avg: AVG((SQ_INSTS / SQ_WAVES)) min: MIN((SQ_INSTS / 
SQ_WAVES)) max: MAX((SQ_INSTS / SQ_WAVES)) diff --git a/src/omniperf_analyze/configs/gfx90a/1000_compute-unit-instruction-mix.yaml b/src/omniperf_analyze/configs/gfx90a/1000_compute-unit-instruction-mix.yaml index 8ffd87d2c..f7867b6ea 100644 --- a/src/omniperf_analyze/configs/gfx90a/1000_compute-unit-instruction-mix.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1000_compute-unit-instruction-mix.yaml @@ -10,7 +10,7 @@ Panel Config: data source: - metric_table: id: 1001 - title: Instruction Mix + title: Overall Instruction Mix header: metric: Metric avg: Avg @@ -22,7 +22,7 @@ Panel Config: type: simple_bar label_txt: (# of instr + $normUnit) metric: - VALU - Vector: + VALU: avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) @@ -40,7 +40,7 @@ Panel Config: max: MAX((SQ_INSTS_LDS / $denom)) unit: (instr + $normUnit) tips: - VALU - MFMA: + MFMA: avg: AVG((SQ_INSTS_MFMA / $denom)) min: MIN((SQ_INSTS_MFMA / $denom)) max: MAX((SQ_INSTS_MFMA / $denom)) @@ -64,12 +64,6 @@ Panel Config: max: MAX((SQ_INSTS_BRANCH / $denom)) unit: (instr + $normUnit) tips: - GDS: - avg: AVG((SQ_INSTS_GDS / $denom)) - min: MIN((SQ_INSTS_GDS / $denom)) - max: MAX((SQ_INSTS_GDS / $denom)) - unit: (instr + $normUnit) - tips: - metric_table: id: 1002 @@ -180,55 +174,100 @@ Panel Config: id: 1003 title: VMEM Instr Mix header: - type: type - count: Count + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit tips: Tips metric: - Buffer Instr: - count: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - tips: - Buffer Read: - count: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - tips: - Buffer Write: - count: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - tips: - Buffer Atomic: - count: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - tips: - Flat Instr: - count: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - tips: - Flat Read: - count: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - tips: - Flat Write: - count: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - tips: - Flat Atomic: - count: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - tips: + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Read: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: 
MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: - metric_table: id: 1004 title: MFMA Arithmetic Instr Mix header: - type: type - count: Count + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit tips: Tips metric: MFMA-I8: - count: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) + avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) + unit: (instr + $normUnit) tips: MFMA-F16: - count: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) + avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) + unit: (instr + $normUnit) tips: MFMA-BF16: - count: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + unit: (instr + $normUnit) tips: MFMA-F32: - count: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) - tips: + avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) + unit: (instr + $normUnit) + tips: MFMA-F64: - count: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) - tips: + avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) + unit: (instr + $normUnit) + tips: diff --git a/src/omniperf_analyze/configs/gfx90a/1100_compute-unit-compute-pipeline.yaml b/src/omniperf_analyze/configs/gfx90a/1100_compute-unit-compute-pipeline.yaml index 39a144731..04b7d6027 100644 --- a/src/omniperf_analyze/configs/gfx90a/1100_compute-unit-compute-pipeline.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1100_compute-unit-compute-pipeline.yaml @@ -13,8 +13,10 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit + peak: Peak + pop: Pct of Peak tips: Tips style: type: simple_bar @@ -22,39 +24,62 @@ Panel Config: label_txt: (%) xrange: [0, 110] metric: - valu_flops_pop: - value: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) + / (EndNs - BeginNs))) + unit: GFLOP + peak: (((($sclk * $numCU) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (EndNs - BeginNs)))) / (((($sclk * $numCU) * 64) * 2) / 1000)) - unit: Pct of Peak tips: - mfma_flops_bf16_pop: - value: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (EndNs - BeginNs)))) - / ((($sclk * $numCU) * 512) / 1000)) - unit: Pct of Peak + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / 
(EndNs - BeginNs))) + unit: GIOP + peak: (((($sclk * $numCU) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (EndNs + - BeginNs)))) / (((($sclk * $numCU) * 64) * 2) / 1000)) tips: - mfma_flops_f16_pop: - value: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (EndNs - BeginNs)))) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (EndNs - BeginNs))) + unit: GFLOP + peak: ((($sclk * $numCU) * 1024) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (EndNs - BeginNs)))) / ((($sclk * $numCU) * 1024) / 1000)) - unit: Pct of Peak tips: - mfma_flops_f32_pop: - value: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (EndNs - BeginNs)))) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (EndNs - BeginNs))) + unit: GFLOP + peak: ((($sclk * $numCU) * 1024) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (EndNs - BeginNs)))) + / ((($sclk * $numCU) * 1024) / 1000)) + tips: + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (EndNs - BeginNs))) + unit: GFLOP + peak: ((($sclk * $numCU) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (EndNs - BeginNs)))) / ((($sclk * $numCU) * 256) / 1000)) - unit: Pct of Peak tips: - mfma_flops_f64_pop: - value: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (EndNs - BeginNs)))) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (EndNs - BeginNs))) + unit: GFLOP + peak: ((($sclk * $numCU) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (EndNs - BeginNs)))) / ((($sclk * $numCU) * 256) / 1000)) - unit: Pct of Peak tips: - mfma_flops_i8_pop: - value: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (EndNs - BeginNs)))) + MFMA IOPs (INT8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (EndNs - BeginNs))) + unit: GIOP + peak: ((($sclk * $numCU) * 1024) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (EndNs - BeginNs)))) / ((($sclk * $numCU) * 1024) / 1000)) - unit: Pct of Peak tips: - metric_table: @@ -68,36 +93,48 @@ Panel Config: unit: Unit tips: Tips metric: - IPC (Avg): + IPC: avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) unit: Instr/cycle tips: - IPC (Issue): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + IPC (Issued): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) - min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) unit: Instr/cycle tips: - SALU Util: + SALU Utilization: avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) min: MIN((((100 * 
SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) max: MAX((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) unit: pct tips: - VALU Util: + VALU Utilization: avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) min: MIN((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) max: MAX((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) unit: pct tips: + VMEM Utilization: + avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / GRBM_GUI_ACTIVE) / $numCU)) + min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / GRBM_GUI_ACTIVE) / $numCU)) + max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / GRBM_GUI_ACTIVE) / $numCU)) + unit: pct + tips: + Branch Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / GRBM_GUI_ACTIVE) / $numCU)) + min: MIN((((100 * SQ_ACTIVE_INST_MISC) / GRBM_GUI_ACTIVE) / $numCU)) + max: MAX((((100 * SQ_ACTIVE_INST_MISC) / GRBM_GUI_ACTIVE) / $numCU)) + unit: pct + tips: VALU Active Threads: avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) @@ -107,7 +144,7 @@ Panel Config: != 0) else None)) unit: Threads tips: - MFMA Util: + MFMA Utilization: avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $numCU) * GRBM_GUI_ACTIVE))) min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $numCU) * GRBM_GUI_ACTIVE))) max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $numCU) * GRBM_GUI_ACTIVE))) @@ -122,6 +159,26 @@ Panel Config: else None)) unit: cycles/instr tips: + VMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_VMEM + tips: + SMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_SMEM + tips: - metric_table: id: 1103 @@ -158,10 +215,10 @@ Panel Config: $denom)) unit: (OPs + $normUnit) tips: - INT8 OPs: - avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + IOPs (Total): + avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) + min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) + max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) unit: (OPs + $normUnit) tips: F16 OPs: @@ -200,52 +257,9 @@ Panel Config: + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) unit: (OPs + $normUnit) tips: - - - metric_table: - id: 1104 - title: Memory Latencies - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - VMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - unit: Cycles - tips: SQ_INSTS_LEVEL_VMEM - SMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM 
!= 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - unit: Cycles - tips: SQ_INSTS_LEVEL_SMEM - Instr Fetch Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_IFETCH) if (SQ_IFETCH != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_IFETCH) if (SQ_IFETCH != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_IFETCH) if (SQ_IFETCH != 0) - else None)) - unit: Cycles - tips: SQ_IFETCH_LEVEL - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - unit: Cycles - tips: SQ_INST_LEVEL_LDS - \ No newline at end of file + INT8 OPs: + avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + unit: (OPs + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx90a/1200_lds.yaml b/src/omniperf_analyze/configs/gfx90a/1200_lds.yaml index 3fd52c3b1..6af1641d1 100644 --- a/src/omniperf_analyze/configs/gfx90a/1200_lds.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1200_lds.yaml @@ -13,7 +13,7 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: @@ -26,20 +26,24 @@ Panel Config: value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / (GRBM_GUI_ACTIVE * $numCU))) unit: Pct of Peak tips: + unit: pct Access Rate: value: AVG(((200 * SQ_ACTIVE_INST_LDS) / (GRBM_GUI_ACTIVE * $numCU))) unit: Pct of Peak tips: - Bandwidth (Pct-of-Peak): + unit: pct + Theoretical Bandwidth (% of Peak): value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs)) / (($sclk * $numCU) * 0.00128))) unit: Pct of Peak tips: + unit: pct Bank Conflict Rate: value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) unit: Pct of Peak tips: + unit: pct - metric_table: id: 1202 @@ -58,7 +62,7 @@ Panel Config: max: MAX((SQ_INSTS_LDS / $denom)) unit: (Instr + $normUnit) tips: - Bandwidth: + Theoretical Bandwidth: avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / $denom)) min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) @@ -67,7 +71,14 @@ Panel Config: / $denom)) unit: (Bytes + $normUnit) tips: - Bank Conficts/Access: + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + tips: + Bank Conflicts/Access: avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) @@ -82,7 +93,7 @@ Panel Config: max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) unit: (Cycles + $normUnit) tips: - Atomic Cycles: + Atomic Return Cycles: avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) @@ -110,12 +121,5 @@ Panel Config: avg: 
AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) - unit: ( + $normUnit) - tips: - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_LDS - tips: + unit: (Accesses + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx90a/1300_instruction-cache.yaml b/src/omniperf_analyze/configs/gfx90a/1300_instruction-cache.yaml index 329a7edba..98a38e2c2 100644 --- a/src/omniperf_analyze/configs/gfx90a/1300_instruction-cache.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1300_instruction-cache.yaml @@ -13,7 +13,7 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: @@ -27,11 +27,16 @@ Panel Config: * (EndNs - BeginNs)))) unit: Pct of Peak tips: - Cache Hit: + Cache Hit Rate: value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + SQC_ICACHE_MISSES_DUPLICATE))) unit: Pct of Peak tips: + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($sclk * $numSQC) + * (EndNs - BeginNs)))) + unit: Pct of Peak + tips: - metric_table: id: 1302 @@ -68,7 +73,7 @@ Panel Config: max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) unit: (Misses + $normUnit) tips: - Cache Hit: + Cache Hit Rate: avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + SQC_ICACHE_MISSES_DUPLICATE))) min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + @@ -77,3 +82,27 @@ Panel Config: SQC_ICACHE_MISSES_DUPLICATE))) unit: pct tips: + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + tips: + - metric_table: + id: 1303 + title: Instruction Cache - L2 Interface + header: + metric: Metric + mean: Mean + min: Min + max: Max + unit: Unit + tips: Tips + metric: + L1I-L2 Bandwidth: + mean: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx90a/1400_constant-cache.yaml b/src/omniperf_analyze/configs/gfx90a/1400_constant-cache.yaml index 563caad13..aa55fee0c 100644 --- a/src/omniperf_analyze/configs/gfx90a/1400_constant-cache.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1400_constant-cache.yaml @@ -12,8 +12,8 @@ Panel Config: id: 1401 title: Speed-of-Light header: - mertic: Metric - value: Value + metric: Metric + value: Avg unit: Unit tips: Tips style: @@ -27,12 +27,17 @@ Panel Config: * (EndNs - BeginNs)))) unit: Pct of Peak tips: - Cache Hit: + Cache Hit Rate: value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: Pct of Peak tips: + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) + / (2 * ($sclk * $numSQC) * (EndNs - BeginNs))) + unit: Pct of Peak + tips: - metric_table: id: 1402 @@ -69,7 +74,7 @@ Panel Config: max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) unit: (Req + $normUnit) tips: - 
Cache Hit: + Cache Hit Rate: avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) @@ -138,6 +143,12 @@ Panel Config: unit: Unit tips: Tips metric: + sL1D-L2 BW: + mean: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + unit: (Bytes + $normUnit) + tips: Read Req: avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) min: MIN((SQC_TC_DATA_READ_REQ / $denom)) @@ -156,7 +167,7 @@ Panel Config: max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) unit: (Req + $normUnit) tips: - Stall: + Stall Cycles: avg: AVG((SQC_TC_STALL / $denom)) min: MIN((SQC_TC_STALL / $denom)) max: MAX((SQC_TC_STALL / $denom)) diff --git a/src/omniperf_analyze/configs/gfx90a/1500_TA_and_TD.yaml b/src/omniperf_analyze/configs/gfx90a/1500_TA_and_TD.yaml index 03af85497..5f7d73df8 100644 --- a/src/omniperf_analyze/configs/gfx90a/1500_TA_and_TD.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1500_TA_and_TD.yaml @@ -6,11 +6,11 @@ Metric Description: # Define the panel properties and properties of each metric in the panel. Panel Config: id: 1500 - title: Texture Addresser and Texture Data (TA/TD) + title: Address Processing Unit and Data Return Path (TA/TD) data source: - metric_table: id: 1501 - title: TA + title: Address Processing Unit header: metric: Metric avg: Avg @@ -19,25 +19,25 @@ Panel Config: unit: Unit tips: Tips metric: - TA Busy: + Address Processing Unit Busy: avg: AVG(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TA Addr Stall: + Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TA Data Stall: + Data Stall: avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TD2TA Addr Stall: + Data-Processor → Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) @@ -47,69 +47,69 @@ Panel Config: avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Instr: + Global/Generic Instructions: avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Read Instr: + Global/Generic Read Instructions: avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instr + 
$normUnit) + unit: (Instructions + $normUnit) tips: - Flat Write Instr: + Global/Generic Write Instructions: avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Atomic Instr: + Global/Generic Atomic Instructions: avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Instr: + Spill/Stack Instructions: avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Read Instr: + Spill/Stack Read Instructions: avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Write Instr: + Spill/Stack Write Instructions: avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Atomic Instr: + Spill/Stack Atomic Instructions: avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Total Cylces: + Spill/Stack Total Cycles: avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) tips: - Buffer Coalesced Read: + Spill/Stack Coalesced Read: avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) tips: - Buffer Coalesced Write: + Spill/Stack Coalesced Write: avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) @@ -127,48 +127,48 @@ Panel Config: unit: Unit tips: Tips metric: - TD Busy: + Data-Return Busy: avg: AVG(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TD Stall: + Cache RAM → Data-Return Stall: avg: AVG(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - SPI2TD Stall: + Workgroup manager → Data-Return Stall: avg: AVG(((100 * TD_SPI_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TD_SPI_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TD_SPI_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - Coalescable Instr: + Coalescable Instructions: avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Load Instr: + Read 
Instructions: avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Store Instr: + Write Instructions: avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Atomic Instr: + Atomic Instructions: avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: diff --git a/src/omniperf_analyze/configs/gfx90a/1600_L1_cache.yaml b/src/omniperf_analyze/configs/gfx90a/1600_L1_cache.yaml index d9291de21..559bbdcb9 100644 --- a/src/omniperf_analyze/configs/gfx90a/1600_L1_cache.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1600_L1_cache.yaml @@ -13,7 +13,7 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: @@ -22,26 +22,26 @@ Panel Config: label_txt: (%) xrange: [0, 110] metric: - Buffer Coalescing: - value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - tips: - Cache Util: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) unit: Pct of Peak tips: - Cache BW: + Bandwidth: value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs)))) / ((($sclk / 1000) * 64) * $numCU)) unit: Pct of Peak tips: - Cache Hit: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + tips: + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) unit: Pct of Peak tips: @@ -141,11 +141,26 @@ Panel Config: unit: (Req + $normUnit) tips: Cache BW: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - unit: GB/s + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + unit: (Bytes + $normUnit) tips: + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + 
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + unit: pct + tips: Cache Accesses: avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) @@ -164,22 +179,7 @@ Panel Config: / $denom)) unit: (Req + $normUnit) tips: - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: pct - tips: - Invalidate: + Invalidations: avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) @@ -188,9 +188,9 @@ Panel Config: L1-L2 BW: avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - min: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - max: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) unit: (Bytes + $normUnit) tips: @@ -388,17 +388,17 @@ Panel Config: avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Hits + $normUnit) + units: (Req + $normUnit) tips: - Misses (Translation): + Translation Misses: avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Misses + $normUnit) + units: (Req + $normUnit) tips: - Misses (Permission): + Permission Misses: avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Misses + $normUnit) + units: (Req + $normUnit) tips: diff --git a/src/omniperf_analyze/configs/gfx90a/1700_L2_cache.yaml b/src/omniperf_analyze/configs/gfx90a/1700_L2_cache.yaml index ddbaf9155..b2e8c6946 100644 --- a/src/omniperf_analyze/configs/gfx90a/1700_L2_cache.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1700_L2_cache.yaml @@ -13,31 +13,35 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: type: simple_bar metric: - L2 Util: + Utilization: value: AVG(((TCC_BUSY_sum * 100) / 
(TO_INT($L2Banks) * GRBM_GUI_ACTIVE))) unit: pct + tips: + Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 128) / (EndNs - BeginNs)))) / ((($sclk / 1000) * 128) * TO_INT($L2Banks))) + unit: pct tips: - Cache Hit: + Hit Rate: value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)) unit: pct - tips: - L2-EA Rd BW: + tips: + L2-Fabric Read BW: value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / (EndNs - BeginNs))) unit: GB/s - tips: - L2-EA Wr BW: + tips: + L2-Fabric Write and Atomic BW: value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / (EndNs - BeginNs))) unit: GB/s - tips: + tips: - metric_table: id: 1702 @@ -50,7 +54,7 @@ Panel Config: unit: Unit tips: Tips metric: - Read BW: + L2-Fabric Read BW: avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / $denom)) min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) @@ -58,8 +62,26 @@ Panel Config: max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / $denom)) unit: (Bytes + $normUnit) - tips: - Write BW: + tips: + HBM Read Traffic: + avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + Remote Read Traffic: + avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + L2-Fabric Write and Atomic BW: avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / $denom)) min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) @@ -67,55 +89,31 @@ Panel Config: max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / $denom)) unit: (Bytes + $normUnit) - tips: - Read (32B): - avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (Uncached 32B): - avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (64B): - avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - HBM Read: - avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write (32B): - avg: 
AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - Write (Uncached 32B): - avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write (64B): - avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Write: - avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: + tips: + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Remote Write and Atomic Traffic: + avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Atomic Traffic: + avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: Read Latency: avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) @@ -124,7 +122,7 @@ Panel Config: max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) unit: Cycles - tips: + tips: Write Latency: avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) @@ -133,7 +131,7 @@ Panel Config: max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) unit: Cycles - tips: + tips: Atomic Latency: avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum != 0) else None)) @@ -142,7 +140,7 @@ Panel Config: max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum != 0) else None)) unit: Cycles - tips: + tips: Read Stall: avg: AVG((((100 * ((TCC_EA_RDREQ_IO_CREDIT_STALL_sum + TCC_EA_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != @@ -154,7 +152,7 @@ Panel Config: + TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) unit: pct - tips: + tips: Write Stall: avg: AVG((((100 * ((TCC_EA_WRREQ_IO_CREDIT_STALL_sum + TCC_EA_WRREQ_GMI_CREDIT_STALL_sum) + TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if 
(TCC_BUSY_sum != @@ -166,7 +164,7 @@ Panel Config: + TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) unit: pct - tips: + tips: - metric_table: id: 1703 @@ -179,117 +177,123 @@ Panel Config: unit: Unit tips: Tips metric: + Bandwidth: + avg: AVG((TCC_REQ_sum * 128) / $denom) + min: MIN((TCC_REQ_sum * 128) / $denom) + max: MAX((TCC_REQ_sum * 128) / $denom) + unit: (Bytes + $normUnit) + tips: Req: avg: AVG((TCC_REQ_sum / $denom)) min: MIN((TCC_REQ_sum / $denom)) max: MAX((TCC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: + tips: Read Req: avg: AVG((TCC_READ_sum / $denom)) min: MIN((TCC_READ_sum / $denom)) max: MAX((TCC_READ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: Write Req: avg: AVG((TCC_WRITE_sum / $denom)) min: MIN((TCC_WRITE_sum / $denom)) max: MAX((TCC_WRITE_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: Atomic Req: avg: AVG((TCC_ATOMIC_sum / $denom)) min: MIN((TCC_ATOMIC_sum / $denom)) max: MAX((TCC_ATOMIC_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: Probe Req: avg: AVG((TCC_PROBE_sum / $denom)) min: MIN((TCC_PROBE_sum / $denom)) max: MAX((TCC_PROBE_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + tips: Hits: avg: AVG((TCC_HIT_sum / $denom)) min: MIN((TCC_HIT_sum / $denom)) max: MAX((TCC_HIT_sum / $denom)) unit: (Hits + $normUnit) - tips: + tips: Misses: avg: AVG((TCC_MISS_sum / $denom)) min: MIN((TCC_MISS_sum / $denom)) max: MAX((TCC_MISS_sum / $denom)) unit: (Misses + $normUnit) - tips: - Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - tips: + tips: Writeback: avg: AVG((TCC_WRITEBACK_sum / $denom)) min: MIN((TCC_WRITEBACK_sum / $denom)) max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: + unit: (Cachelines + $normUnit) + tips: + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Evict (Internal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: 
MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: NC Req: avg: AVG((TCC_NC_REQ_sum / $denom)) min: MIN((TCC_NC_REQ_sum / $denom)) max: MAX((TCC_NC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: UC Req: avg: AVG((TCC_UC_REQ_sum / $denom)) min: MIN((TCC_UC_REQ_sum / $denom)) max: MAX((TCC_UC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: CC Req: avg: AVG((TCC_CC_REQ_sum / $denom)) min: MIN((TCC_CC_REQ_sum / $denom)) max: MAX((TCC_CC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: RW Req: avg: AVG((TCC_RW_REQ_sum / $denom)) min: MIN((TCC_RW_REQ_sum / $denom)) max: MAX((TCC_RW_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: - Writeback (Normal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: - Writeback (TC Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: - Evict (Normal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: ( + $normUnit) - tips: - Evict (TC Req): - avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: ( + $normUnit) - tips: + tips: - metric_table: id: 1704 @@ -306,59 +310,137 @@ Panel Config: style: type: simple_multi_bar metric: - Read - Remote Socket Stall: - type: Remote Socket Stall + Read - PCIe Stall: + type: PCIe Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read - Peer GCD Stall: - type: Peer GCD Stall + avg: AVG(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Read - Infinity Fabric™ Stall: + type: Infinity Fabric™ Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Read - HBM Stall: type: HBM Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write - Remote Socket Stall: - type: Remote Socket Stall + avg: AVG(((100 * (TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * 
(TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Write - PCIe Stall: + type: PCIe Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write - Peer GCD Stall: - type: Peer GCD Stall + avg: AVG(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Write - Infinity Fabric™ Stall: + type: Infinity Fabric™ Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Write - HBM Stall: type: HBM Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Write - Credit Starvation: type: Credit Starvation transaction: Write - avg: AVG((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) - min: MIN((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) - max: MAX((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + + - metric_table: + id: 1705 + title: L2 - Fabric Detailed Transaction Breakdown + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Read (32B): + avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: + Read (Uncached): + avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Read (64B): + avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + unit: (Req + $normUnit) + tips: + HBM Read: + avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + tips: + 
Remote Read: + avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (32B): + avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (Uncached): + avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (64B): + avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + tips: + HBM Write and Atomic: + avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + tips: + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + tips: + Atomic: + avg: AVG((TCC_EA_ATOMIC_sum / $denom)) + min: MIN((TCC_EA_ATOMIC_sum / $denom)) + max: MAX((TCC_EA_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx90a/1800_L2_cache_per_channel.yaml b/src/omniperf_analyze/configs/gfx90a/1800_L2_cache_per_channel.yaml index c6d93aa61..42d3014b1 100644 --- a/src/omniperf_analyze/configs/gfx90a/1800_L2_cache_per_channel.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1800_L2_cache_per_channel.yaml @@ -167,7 +167,7 @@ Panel Config: + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) unit: pct tips: - Req: + L2 Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_REQ[0]) + TO_INT(TCC_REQ[1])) + TO_INT(TCC_REQ[2])) + TO_INT(TCC_REQ[3])) + TO_INT(TCC_REQ[4])) + TO_INT(TCC_REQ[5])) + TO_INT(TCC_REQ[6])) + TO_INT(TCC_REQ[7])) + TO_INT(TCC_REQ[8])) + TO_INT(TCC_REQ[9])) @@ -206,7 +206,7 @@ Panel Config: + TO_INT(TCC_REQ[30])) + TO_INT(TCC_REQ[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Read Req: + L2 Read Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_READ[0]) + TO_INT(TCC_READ[1])) + TO_INT(TCC_READ[2])) + TO_INT(TCC_READ[3])) + TO_INT(TCC_READ[4])) + TO_INT(TCC_READ[5])) + TO_INT(TCC_READ[6])) + TO_INT(TCC_READ[7])) + TO_INT(TCC_READ[8])) + TO_INT(TCC_READ[9])) @@ -249,7 +249,7 @@ Panel Config: + TO_INT(TCC_READ[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Write Req: + L2 Write Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_WRITE[0]) + TO_INT(TCC_WRITE[1])) + TO_INT(TCC_WRITE[2])) + TO_INT(TCC_WRITE[3])) + TO_INT(TCC_WRITE[4])) + TO_INT(TCC_WRITE[5])) + TO_INT(TCC_WRITE[6])) + TO_INT(TCC_WRITE[7])) + TO_INT(TCC_WRITE[8])) @@ -296,7 +296,7 @@ Panel Config: + TO_INT(TCC_WRITE[30])) + TO_INT(TCC_WRITE[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Atomic Req: + L2 Atomic Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_ATOMIC[0]) + TO_INT(TCC_ATOMIC[1])) + TO_INT(TCC_ATOMIC[2])) + TO_INT(TCC_ATOMIC[3])) + TO_INT(TCC_ATOMIC[4])) + TO_INT(TCC_ATOMIC[5])) + TO_INT(TCC_ATOMIC[6])) + 
TO_INT(TCC_ATOMIC[7])) @@ -347,7 +347,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Read Req: + L2 - Fabric Read Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ[0]) + TO_INT(TCC_EA_RDREQ[1])) + TO_INT(TCC_EA_RDREQ[2])) + TO_INT(TCC_EA_RDREQ[3])) + TO_INT(TCC_EA_RDREQ[4])) + TO_INT(TCC_EA_RDREQ[5])) + TO_INT(TCC_EA_RDREQ[6])) + TO_INT(TCC_EA_RDREQ[7])) @@ -398,7 +398,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Write Req: + L2 - Fabric Write and Atomic Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ[0]) + TO_INT(TCC_EA_WRREQ[1])) + TO_INT(TCC_EA_WRREQ[2])) + TO_INT(TCC_EA_WRREQ[3])) + TO_INT(TCC_EA_WRREQ[4])) + TO_INT(TCC_EA_WRREQ[5])) + TO_INT(TCC_EA_WRREQ[6])) + TO_INT(TCC_EA_WRREQ[7])) @@ -449,7 +449,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Atomic Req: + L2 - Fabric Atomic Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_ATOMIC[0]) + TO_INT(TCC_EA_ATOMIC[1])) + TO_INT(TCC_EA_ATOMIC[2])) + TO_INT(TCC_EA_ATOMIC[3])) + TO_INT(TCC_EA_ATOMIC[4])) + TO_INT(TCC_EA_ATOMIC[5])) + TO_INT(TCC_EA_ATOMIC[6])) + TO_INT(TCC_EA_ATOMIC[7])) @@ -500,7 +500,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Read Lat: + L2 - Fabric Read Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_RDREQ_LEVEL[0] + TCC_EA_RDREQ_LEVEL[1]) + TCC_EA_RDREQ_LEVEL[2]) + TCC_EA_RDREQ_LEVEL[3]) + TCC_EA_RDREQ_LEVEL[4]) + TCC_EA_RDREQ_LEVEL[5]) + TCC_EA_RDREQ_LEVEL[6]) + TCC_EA_RDREQ_LEVEL[7]) @@ -615,7 +615,7 @@ Panel Config: + TCC_EA_RDREQ[29]) + TCC_EA_RDREQ[30]) + TCC_EA_RDREQ[31]) != 0) else None)) unit: Cycles tips: - L2 - EA Write Lat: + L2 - Fabric Write Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_WRREQ_LEVEL[0] + TCC_EA_WRREQ_LEVEL[1]) + TCC_EA_WRREQ_LEVEL[2]) + TCC_EA_WRREQ_LEVEL[3]) + TCC_EA_WRREQ_LEVEL[4]) + TCC_EA_WRREQ_LEVEL[5]) + TCC_EA_WRREQ_LEVEL[6]) + TCC_EA_WRREQ_LEVEL[7]) @@ -730,7 +730,7 @@ Panel Config: + TCC_EA_WRREQ[29]) + TCC_EA_WRREQ[30]) + TCC_EA_WRREQ[31]) != 0) else None)) unit: Cycles tips: - L2 - EA Atomic Lat: + L2 - Fabric Atomic Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_ATOMIC_LEVEL[0] + TCC_EA_ATOMIC_LEVEL[1]) + TCC_EA_ATOMIC_LEVEL[2]) + TCC_EA_ATOMIC_LEVEL[3]) + TCC_EA_ATOMIC_LEVEL[4]) + TCC_EA_ATOMIC_LEVEL[5]) + TCC_EA_ATOMIC_LEVEL[6]) + TCC_EA_ATOMIC_LEVEL[7]) @@ -849,7 +849,7 @@ Panel Config: None)) unit: Cycles tips: - L2 - EA Read Stall (IO): + L2 - Fabric Read Stall (PCIe): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[0]) + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[1])) + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[2])) + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[3])) + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[4])) @@ -920,7 +920,7 @@ Panel Config: + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Read Stall (GMI): + L2 - Fabric Read Stall (Infinity Fabric™): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[0]) + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[1])) + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[2])) + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[3])) + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[4])) @@ -991,7 +991,7 @@ Panel Config: + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Read Stall (DRAM): + L2 - Fabric Read Stall (HBM): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[0]) + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[1])) + 
TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[2])) + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[3])) + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[4])) @@ -1062,7 +1062,7 @@ Panel Config: + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (IO): + L2 - Fabric Write Stall (PCIe): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[0]) + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[1])) + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[2])) + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[3])) + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[4])) @@ -1133,7 +1133,7 @@ Panel Config: + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (GMI): + L2 - Fabric Write Stall (Infinity Fabric™): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[0]) + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[1])) + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[2])) + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[3])) + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[4])) @@ -1204,7 +1204,7 @@ Panel Config: + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (DRAM): + L2 - Fabric Write Stall (HBM): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[0]) + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[1])) + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[2])) + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[3])) + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[4])) @@ -1275,7 +1275,7 @@ Panel Config: + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Write Starve: + L2 - Fabric Write Starve: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[0]) + TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[1])) + TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[2])) + TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[3])) + TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[4])) @@ -1354,22 +1354,22 @@ Panel Config: channel: Channel hit rate: L2 Cache Hit Rate (%) req: Requests (Requests) - read req: L1-L2 Read (Requests) - write req: L1-L2 Write (Requests) - atomic req: L1-L2 Atomic (Requests) - ea read req: L2-EA Read (Requests) - ea write req: L2-EA Write (Requests) - ea atomic req: L2-EA Atomic (Requests) - ea read lat - cycles: L2-EA Read Latency (Cycles) - ea write lat - cycles: L2-EA Write Latency (Cycles) - ea atomic lat - cycles: L2-EA Atomic Latency (Cycles) - ea read stall - io: L2-EA Read Stall - IO (Cycles per) - ea read stall - gmi: L2-EA Read Stall - GMI (Cycles per) - ea read stall - dram: L2-EA Read Stall - DRAM (Cycles per) - ea write stall - io: L2-EA Write Stall - IO (Cycles per) - ea write stall - gmi: L2-EA Write Stall - GMI (Cycles per) - ea write stall - dram: L2-EA Write Stall - DRAM (Cycles per) - ea write stall - starve: L2-EA Write Stall - Starve (Cycles per) + read req: L2 Read (Requests) + write req: L2 Write (Requests) + atomic req: L2 Atomic (Requests) + ea read req: L2-Fabric Read (Requests) + ea write req: L2-Fabric Write and Atomic (Requests) + ea atomic req: L2-Fabric Atomic (Requests) + ea read lat - cycles: L2-Fabric Read Latency (Cycles) + ea write lat - cycles: L2-Fabric Write Latency (Cycles) + ea atomic lat - cycles: L2-Fabric Atomic Latency (Cycles) + ea read stall - io: L2-Fabric Read Stall - PCIe (Cycles) + ea read stall - gmi: L2-Fabric Read Stall - Infinity Fabric™ (Cycles) + ea read stall - dram: L2-Fabric Read Stall - HBM (Cycles) + ea write stall - io: L2-Fabric Write Stall - PCIe (Cycles) + ea write stall - 
gmi: L2-Fabric Write Stall - Infinity Fabric™ (Cycles) + ea write stall - dram: L2-Fabric Write Stall - HBM (Cycles) + ea write stall - starve: L2-Fabric Write Stall - Starve (Cycles) tips: Tips metric: '0': @@ -1764,22 +1764,22 @@ Panel Config: channel: Channel hit rate: L2 Cache Hit Rate (%) req: Requests (Requests) - read req: L1-L2 Read (Requests) - write req: L1-L2 Write (Requests) - atomic req: L1-L2 Atomic (Requests) - ea read req: L2-EA Read (Requests) - ea write req: L2-EA Write (Requests) - ea atomic req: L2-EA Atomic (Requests) - ea read lat - cycles: L2-EA Read Latency (Cycles) - ea write lat - cycles: L2-EA Write Latency (Cycles) - ea atomic lat - cycles: L2-EA Atomic Latency (Cycles) - ea read stall - io: L2-EA Read Stall - IO (Cycles per) - ea read stall - gmi: L2-EA Read Stall - GMI (Cycles per) - ea read stall - dram: L2-EA Read Stall - DRAM (Cycles per) - ea write stall - io: L2-EA Write Stall - IO (Cycles per) - ea write stall - gmi: L2-EA Write Stall - GMI (Cycles per) - ea write stall - dram: L2-EA Write Stall - DRAM (Cycles per) - ea write stall - starve: L2-EA Write Stall - Starve (Cycles per) + read req: L2 Read (Requests) + write req: L2 Write (Requests) + atomic req: L2 Atomic (Requests) + ea read req: L2-Fabric Read (Requests) + ea write req: L2-Fabric Write and Atomic (Requests) + ea atomic req: L2-Fabric Atomic (Requests) + ea read lat - cycles: L2-Fabric Read Latency (Cycles) + ea write lat - cycles: L2-Fabric Write Latency (Cycles) + ea atomic lat - cycles: L2-Fabric Atomic Latency (Cycles) + ea read stall - io: L2-Fabric Read Stall - PCIe (Cycles) + ea read stall - gmi: L2-Fabric Read Stall - Infinity Fabric™ (Cycles) + ea read stall - dram: L2-Fabric Read Stall - HBM (Cycles) + ea write stall - io: L2-Fabric Write Stall - PCIe (Cycles) + ea write stall - gmi: L2-Fabric Write Stall - Infinity Fabric™ (Cycles) + ea write stall - dram: L2-Fabric Write Stall - HBM (Cycles) + ea write stall - starve: L2-Fabric Write Stall - Starve (Cycles) tips: Tips metric: '16': diff --git a/src/omniperf_analyze/configs/panel_config_template.yaml b/src/omniperf_analyze/configs/panel_config_template.yaml index e241896b4..4b81bad0e 100644 --- a/src/omniperf_analyze/configs/panel_config_template.yaml +++ b/src/omniperf_analyze/configs/panel_config_template.yaml @@ -30,7 +30,7 @@ Panel Config: value: Value unit: Unit peak: Peak - pop: PoP + pop: Pct of Peak tips: Tips metric: METRIC01: diff --git a/src/omniperf_analyze/utils/file_io.py b/src/omniperf_analyze/utils/file_io.py index 60850b626..8f9b887d0 100644 --- a/src/omniperf_analyze/utils/file_io.py +++ b/src/omniperf_analyze/utils/file_io.py @@ -129,7 +129,7 @@ def create_df_kernel_top_stats( # NB: support ignoring the 1st n dispatched execution by '> n' # The better way may be parsing python slice string if ">" in filter_dispatch_ids[0]: - m = re.match("\> (\d+)", filter_dispatch_ids[0]) + m = re.match(r"\> (\d+)", filter_dispatch_ids[0]) df = df[df["Index"] > int(m.group(1))] else: df = df.loc[df["Index"].astype(str).isin(filter_dispatch_ids)] diff --git a/src/omniperf_analyze/utils/parser.py b/src/omniperf_analyze/utils/parser.py index ceccb0746..5315d2ada 100644 --- a/src/omniperf_analyze/utils/parser.py +++ b/src/omniperf_analyze/utils/parser.py @@ -113,6 +113,11 @@ def to_min(*args): def to_max(*args): if len(args) == 1 and isinstance(args[0], pd.core.series.Series): return args[0].max() + elif len(args) == 2 and ( + isinstance(args[0], pd.core.series.Series) + or isinstance(args[1], pd.core.series.Series) + ): + return 
np.maximum(args[0], args[1]) elif max(args) == None: return np.nan else: @@ -268,7 +273,7 @@ def build_eval_string(equation, coll_level): # build-in variable starts with '$', python can not handle it. # replace '$' with 'ammolite__'. # TODO: pre-check there is no "ammolite__" in all config files. - s = re.sub("\$", "ammolite__", s) + s = re.sub(r"\$", "ammolite__", s) # convert equation string to intermediate expression in df array format ast_node = ast.parse(s) @@ -282,7 +287,7 @@ def build_eval_string(equation, coll_level): # the target is df['TCC_HIT[0]'] s = re.sub(r"\'\]\[(\d+)\]", r"[\g<1>]']", s) # use .get() to catch any potential KeyErrors - s = re.sub("raw_pmc_df\['(.*?)']", r'raw_pmc_df.get("\1")', s) + s = re.sub(r"raw_pmc_df\['(.*?)']", r'raw_pmc_df.get("\1")', s) # apply coll_level s = re.sub(r"raw_pmc_df", "raw_pmc_df.get('" + coll_level + "')", s) # print("--- build_eval_string, return: ", s) @@ -306,7 +311,7 @@ def update_denom_string(equation, unit): def update_normUnit_string(equation, unit): """ - Update $normUnit in equation with runtime nomorlization unit. + Update $normUnit in equation with runtime normalization unit. It is string replacement for display only. """ @@ -315,8 +320,8 @@ def update_normUnit_string(equation, unit): return "" return re.sub( - "\((?P\w*)\s+\+\s+(\$normUnit\))", - "\g " + re.sub("_", " ", unit), + r"\((?P\w*)\s+\+\s+(\$normUnit\))", + r"\g " + re.sub("_", " ", unit), str(equation), ).capitalize() @@ -564,9 +569,10 @@ def eval_metric(dfs, dfs_type, sys_info, soc_spec, raw_pmc_df, debug): # NB: # Following with Omniperf 0.2.0, we are using HW spec from sys_info instead. # The soc_spec is not in using right now, but can be used to do verification - # aganist sys_info, forced theoretical evaluation, or supporting tool-chains + # against sys_info, forced theoretical evaluation, or supporting tool-chains # broken. ammolite__numSE = sys_info.numSE + ammolite__numPipes = soc_spec.numPipes ammolite__numCU = sys_info.numCU ammolite__numSIMD = sys_info.numSIMD ammolite__numWavesPerCU = sys_info.maxWavesPerCU # todo: check do we still need it @@ -612,7 +618,7 @@ def eval_metric(dfs, dfs_type, sys_info, soc_spec, raw_pmc_df, debug): print("~" * 40 + "\nExpression:") print(expr, "=", row[expr]) print("Inputs:") - matched_vars = re.findall("ammolite__\w+", row[expr]) + matched_vars = re.findall(r"ammolite__\w+", row[expr]) if matched_vars: for v in matched_vars: print( @@ -622,12 +628,12 @@ def eval_metric(dfs, dfs_type, sys_info, soc_spec, raw_pmc_df, debug): eval(compile(v, "", "eval")), ) matched_cols = re.findall( - "raw_pmc_df\['\w+'\]\['\w+'\]", row[expr] + r"raw_pmc_df\['\w+'\]\['\w+'\]", row[expr] ) if matched_cols: for c in matched_cols: m = re.match( - "raw_pmc_df\['(\w+)'\]\['(\w+)'\]", c + r"raw_pmc_df\['(\w+)'\]\['(\w+)'\]", c ) t = raw_pmc_df[m.group(1)][ m.group(2) @@ -651,7 +657,7 @@ def eval_metric(dfs, dfs_type, sys_info, soc_spec, raw_pmc_df, debug): print("~" * 40) except TypeError: print( - "skiping entry. Encounterd a missing counter" + "skipping entry. Encountered a missing counter" ) print(expr, " has been assigned to None") print(np.nan) @@ -661,7 +667,7 @@ def eval_metric(dfs, dfs_type, sys_info, soc_spec, raw_pmc_df, debug): == "'NoneType' object has no attribute 'get'" ): print( - "skiping entry. Encounterd a missing csv" + "skipping entry. 
Encountered a missing csv" ) print(np.nan) else: @@ -769,7 +775,7 @@ def apply_filters(workload, dir, is_gui, debug): print("{} is an invalid dispatch id.".format(d)) sys.exit(1) if ">" in workload.filter_dispatch_ids[0]: - m = re.match("\> (\d+)", workload.filter_dispatch_ids[0]) + m = re.match(r"\> (\d+)", workload.filter_dispatch_ids[0]) ret_df = ret_df[ ret_df[schema.pmc_perf_file_prefix]["Index"] > int(m.group(1)) ] diff --git a/src/omniperf_analyze/utils/roofline_calc.py b/src/omniperf_analyze/utils/roofline_calc.py index 275005233..ee00b2458 100644 --- a/src/omniperf_analyze/utils/roofline_calc.py +++ b/src/omniperf_analyze/utils/roofline_calc.py @@ -184,7 +184,7 @@ def plot_roof(roof_details, roof_data, mem_level, verbose): # ------------------------------------------------------------------------------------- # Overlay application performance # ------------------------------------------------------------------------------------- -# Calculate relevent metrics for ai calculation +# Calculate relevant metrics for ai calculation def plot_application(sortType, ret_df, verbose): df = ret_df["pmc_perf"] # Sort by top kernels or top dispatches? diff --git a/src/omniperf_analyze/utils/schema.py b/src/omniperf_analyze/utils/schema.py index f9b59868f..adc19a504 100644 --- a/src/omniperf_analyze/utils/schema.py +++ b/src/omniperf_analyze/utils/schema.py @@ -79,7 +79,7 @@ class Workload: "Min", "Max", "Avg", - "PoP", + "Pct of Peak", "Peak", "Count", "Mean", @@ -91,22 +91,22 @@ class Workload: "Channel", "L2 Cache Hit Rate (%)", "Requests (Requests)", - "L1-L2 Read (Requests)", - "L1-L2 Write (Requests)", - "L1-L2 Atomic (Requests)", - "L2-EA Read (Requests)", - "L2-EA Write (Requests)", - "L2-EA Atomic (Requests)", - "L2-EA Read Latency (Cycles)", - "L2-EA Write Latency (Cycles)", - "L2-EA Atomic Latency (Cycles)", - "L2-EA Read Stall - IO (Cycles per)", - "L2-EA Read Stall - GMI (Cycles per)", - "L2-EA Read Stall - DRAM (Cycles per)", - "L2-EA Write Stall - IO (Cycles per)", - "L2-EA Write Stall - GMI (Cycles per)", - "L2-EA Write Stall - DRAM (Cycles per)", - "L2-EA Write Stall - Starve (Cycles per)", + "L2 Read (Requests)", + "L2 Write (Requests)", + "L2 Atomic (Requests)", + "L2-Fabric Read (Requests)", + "L2-Fabric Write and Atomic (Requests)", + "L2-Fabric Atomic (Requests)", + "L2-Fabric Read Latency (Cycles)", + "L2-Fabric Write Latency (Cycles)", + "L2-Fabric Atomic Latency (Cycles)", + "L2-Fabric Read Stall - PCIe (Cycles)", + "L2-Fabric Read Stall - Infinity Fabric™ (Cycles)", + "L2-Fabric Read Stall - HBM (Cycles)", + "L2-Fabric Write Stall - PCIe (Cycles)", + "L2-Fabric Write Stall - Infinity Fabric™ (Cycles)", + "L2-Fabric Write Stall - HBM (Cycles)", + "L2-Fabric Write Stall - Starve (Cycles)", ] # The prefix of raw pmc_perf.csv diff --git a/src/perfmon_pub/mi100/pmc_tcc_perf.txt b/src/perfmon_pub/mi100/pmc_tcc_perf.txt index 8a6d61de4..7aa7bef20 100644 --- a/src/perfmon_pub/mi100/pmc_tcc_perf.txt +++ b/src/perfmon_pub/mi100/pmc_tcc_perf.txt @@ -4,12 +4,12 @@ pmc: TCC_CYCLE_sum TCC_BUSY_sum pmc: TCC_NC_REQ_sum TCC_UC_REQ_sum TCC_CC_REQ_sum TCC_RW_REQ_sum pmc: TCC_REQ_sum TCC_STREAMING_REQ_sum TCC_HIT_sum TCC_MISS_sum pmc: TCC_READ_sum TCC_WRITE_sum TCC_ATOMIC_sum TCC_WRITEBACK_sum -pmc: TCC_EA_WRREQ_sum TCC_EA_WRREQ_64B_sum TCC_EA_WR_UNCACHED_32B_sum +pmc: TCC_EA_WRREQ_sum TCC_EA_WRREQ_64B_sum TCC_EA_WR_UNCACHED_32B_sum TCC_EA_WRREQ_DRAM_sum pmc: TCC_EA_WRREQ_STALL_sum TCC_EA_WRREQ_IO_CREDIT_STALL_sum TCC_EA_WRREQ_GMI_CREDIT_STALL_sum TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum -pmc: 
TCC_TOO_MANY_EA_WRREQS_STALL_sum TCC_EA_ATOMIC_sum TCC_EA_RDREQ_sum TCC_EA_RDREQ_32B_sum -pmc: TCC_EA_RD_UNCACHED_32B_sum TCC_EA_RDREQ_IO_CREDIT_STALL_sum TCC_EA_RDREQ_GMI_CREDIT_STALL_sum TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum +pmc: TCC_EA_RDREQ_sum TCC_EA_RDREQ_32B_sum TCC_EA_RD_UNCACHED_32B_sum TCC_EA_RDREQ_DRAM_sum +pmc: TCC_EA_RDREQ_IO_CREDIT_STALL_sum TCC_EA_RDREQ_GMI_CREDIT_STALL_sum TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum pmc: TCC_TAG_STALL_sum TCC_NORMAL_WRITEBACK_sum TCC_ALL_TC_OP_WB_WRITEBACK_sum TCC_NORMAL_EVICT_sum -pmc: TCC_ALL_TC_OP_INV_EVICT_sum TCC_EA_RDREQ_DRAM_sum TCC_EA_WRREQ_DRAM_sum +pmc: TCC_ALL_TC_OP_INV_EVICT_sum TCC_TOO_MANY_EA_WRREQS_STALL_sum TCC_EA_ATOMIC_sum pmc: TCC_EA_RDREQ_LEVEL_sum TCC_EA_WRREQ_LEVEL_sum TCC_EA_ATOMIC_LEVEL_sum gpu: diff --git a/src/perfmon_pub/mi200/pmc_tcc_perf.txt b/src/perfmon_pub/mi200/pmc_tcc_perf.txt index 8a6d61de4..5586b0d3e 100644 --- a/src/perfmon_pub/mi200/pmc_tcc_perf.txt +++ b/src/perfmon_pub/mi200/pmc_tcc_perf.txt @@ -4,12 +4,12 @@ pmc: TCC_CYCLE_sum TCC_BUSY_sum pmc: TCC_NC_REQ_sum TCC_UC_REQ_sum TCC_CC_REQ_sum TCC_RW_REQ_sum pmc: TCC_REQ_sum TCC_STREAMING_REQ_sum TCC_HIT_sum TCC_MISS_sum pmc: TCC_READ_sum TCC_WRITE_sum TCC_ATOMIC_sum TCC_WRITEBACK_sum -pmc: TCC_EA_WRREQ_sum TCC_EA_WRREQ_64B_sum TCC_EA_WR_UNCACHED_32B_sum +pmc: TCC_EA_WRREQ_sum TCC_EA_WRREQ_64B_sum TCC_EA_WR_UNCACHED_32B_sum TCC_EA_WRREQ_DRAM_sum pmc: TCC_EA_WRREQ_STALL_sum TCC_EA_WRREQ_IO_CREDIT_STALL_sum TCC_EA_WRREQ_GMI_CREDIT_STALL_sum TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum -pmc: TCC_TOO_MANY_EA_WRREQS_STALL_sum TCC_EA_ATOMIC_sum TCC_EA_RDREQ_sum TCC_EA_RDREQ_32B_sum -pmc: TCC_EA_RD_UNCACHED_32B_sum TCC_EA_RDREQ_IO_CREDIT_STALL_sum TCC_EA_RDREQ_GMI_CREDIT_STALL_sum TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum +pmc: TCC_EA_RDREQ_sum TCC_EA_RDREQ_32B_sum TCC_EA_RD_UNCACHED_32B_sum TCC_EA_RDREQ_DRAM_sum +pmc: TCC_EA_RDREQ_IO_CREDIT_STALL_sum TCC_EA_RDREQ_GMI_CREDIT_STALL_sum TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum pmc: TCC_TAG_STALL_sum TCC_NORMAL_WRITEBACK_sum TCC_ALL_TC_OP_WB_WRITEBACK_sum TCC_NORMAL_EVICT_sum -pmc: TCC_ALL_TC_OP_INV_EVICT_sum TCC_EA_RDREQ_DRAM_sum TCC_EA_WRREQ_DRAM_sum +pmc: TCC_ALL_TC_OP_INV_EVICT_sum TCC_TOO_MANY_EA_WRREQS_STALL_sum TCC_EA_ATOMIC_sum pmc: TCC_EA_RDREQ_LEVEL_sum TCC_EA_WRREQ_LEVEL_sum TCC_EA_ATOMIC_LEVEL_sum gpu: diff --git a/src/perfmon_pub/mi50/pmc_tcc_perf.txt b/src/perfmon_pub/mi50/pmc_tcc_perf.txt index dd71aba6a..7e22f0445 100644 --- a/src/perfmon_pub/mi50/pmc_tcc_perf.txt +++ b/src/perfmon_pub/mi50/pmc_tcc_perf.txt @@ -4,12 +4,12 @@ pmc: TCC_CYCLE_sum TCC_BUSY_sum pmc: TCC_NC_REQ_sum TCC_UC_REQ_sum TCC_CC_REQ_sum pmc: TCC_REQ_sum TCC_STREAMING_REQ_sum TCC_HIT_sum TCC_MISS_sum pmc: TCC_READ_sum TCC_WRITE_sum TCC_ATOMIC_sum TCC_WRITEBACK_sum -pmc: TCC_EA_WRREQ_sum TCC_EA_WRREQ_64B_sum TCC_EA_WR_UNCACHED_32B_sum +pmc: TCC_EA_WRREQ_sum TCC_EA_WRREQ_64B_sum TCC_EA_WR_UNCACHED_32B_sum TCC_EA_WRREQ_DRAM_sum pmc: TCC_EA_WRREQ_STALL_sum TCC_EA_WRREQ_IO_CREDIT_STALL_sum TCC_EA_WRREQ_GMI_CREDIT_STALL_sum TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum -pmc: TCC_TOO_MANY_EA_WRREQS_STALL_sum TCC_EA_ATOMIC_sum TCC_EA_RDREQ_sum TCC_EA_RDREQ_32B_sum -pmc: TCC_EA_RD_UNCACHED_32B_sum TCC_EA_RDREQ_IO_CREDIT_STALL_sum TCC_EA_RDREQ_GMI_CREDIT_STALL_sum TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum +pmc: TCC_EA_RDREQ_sum TCC_EA_RDREQ_32B_sum TCC_EA_RD_UNCACHED_32B_sum TCC_EA_RDREQ_DRAM_sum +pmc: TCC_EA_RDREQ_IO_CREDIT_STALL_sum TCC_EA_RDREQ_GMI_CREDIT_STALL_sum TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum pmc: TCC_NORMAL_WRITEBACK_sum TCC_ALL_TC_OP_WB_WRITEBACK_sum TCC_NORMAL_EVICT_sum -pmc: 
TCC_ALL_TC_OP_INV_EVICT_sum TCC_EA_RDREQ_DRAM_sum TCC_EA_WRREQ_DRAM_sum +pmc: TCC_ALL_TC_OP_INV_EVICT_sum TCC_TOO_MANY_EA_WRREQS_STALL_sum TCC_EA_ATOMIC_sum pmc: TCC_EA_RDREQ_LEVEL_sum TCC_EA_WRREQ_LEVEL_sum TCC_EA_ATOMIC_LEVEL_sum gpu: diff --git a/src/soc_params/mi100.csv b/src/soc_params/mi100.csv index c52a4e1bb..fd0c02cb1 100644 --- a/src/soc_params/mi100.csv +++ b/src/soc_params/mi100.csv @@ -1,2 +1,2 @@ -name,numSE,numCU,numSIMD,numWavesPerCU,numSQC,L2Banks,LDSBanks,Freq,mclk -mi100,8,120,480,40,30,32,32,1502,1200 +name,numSE,numPipes,numCU,numSIMD,numWavesPerCU,numSQC,L2Banks,LDSBanks,Freq,mclk +mi100,8,4,120,480,40,30,32,32,1502,1200 diff --git a/src/soc_params/mi200.csv b/src/soc_params/mi200.csv index bf6343fc0..64faa3c0f 100644 --- a/src/soc_params/mi200.csv +++ b/src/soc_params/mi200.csv @@ -1,2 +1,2 @@ -name,numSE,numCU,numSIMD,numWavesPerCU,numSQC,L2Banks,LDSBanks,Freq,mclk -mi200,8,110,440,32,56,32,32,1700,1600 +name,numSE,numPipes,numCU,numSIMD,numWavesPerCU,numSQC,L2Banks,LDSBanks,Freq,mclk +mi200,8,4,110,440,32,56,32,32,1700,1600 diff --git a/src/soc_params/mi50.csv b/src/soc_params/mi50.csv index f5e1bda0b..de62ad707 100644 --- a/src/soc_params/mi50.csv +++ b/src/soc_params/mi50.csv @@ -1,2 +1,2 @@ -name,numSE,numCU,numSIMD,numWavesPerCU,numSQC,L2Banks,LDSBanks,Freq,mclk -mi50,4,60,240,40,15,16,32,1725,1000 +name,numSE,numPipes,numCU,numSIMD,numWavesPerCU,numSQC,L2Banks,LDSBanks,Freq,mclk +mi50,4,4,60,240,40,15,16,32,1725,1000
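
Reviewer note (not part of the patch): a minimal sketch of how the two-argument form of `to_max()` added in `src/omniperf_analyze/utils/parser.py` appears to support the new "Remote Read" / "Remote Write and Atomic" expressions in `1700_L2_cache.yaml`, which use `MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0)`. With per-kernel counters held in pandas Series, the clamp has to be element-wise, so the two-argument case dispatches to `np.maximum`. The counter values below are invented purely for illustration.

```python
import numpy as np
import pandas as pd


def to_max(*args):
    # Single-Series form: reduce one counter column to its maximum
    # (pre-existing behavior in parser.py).
    if len(args) == 1 and isinstance(args[0], pd.Series):
        return args[0].max()
    # Two-argument form added by this patch: element-wise maximum, so an
    # expression like MAX(TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum, 0)
    # clamps each kernel's value at zero instead of collapsing the column.
    elif len(args) == 2 and (
        isinstance(args[0], pd.Series) or isinstance(args[1], pd.Series)
    ):
        return np.maximum(args[0], args[1])
    elif max(args) is None:
        return np.nan
    else:
        return max(*args)


# Illustrative (made-up) per-kernel counter values.
rdreq_total = pd.Series([1000, 2500, 400])  # TCC_EA_RDREQ_sum
rdreq_dram = pd.Series([900, 2600, 400])    # TCC_EA_RDREQ_DRAM_sum

# "Remote Read" requests: fabric reads not serviced by local HBM,
# clamped at zero per kernel to absorb counter jitter.
remote_read = to_max(rdreq_total - rdreq_dram, 0)
print(remote_read.tolist())  # [100, 0, 0]
```

The function mirrors the patched `to_max()` (with `is None` used in place of `== None` for the sketch); everything else, including the sample counter values, is hypothetical and only meant to show why the element-wise branch is needed for the new L2-Fabric breakdown metrics.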