diff --git a/AUTHORS b/AUTHORS index 5e721f8b7..d07fca732 100644 --- a/AUTHORS +++ b/AUTHORS @@ -9,3 +9,4 @@ Cole Ramos Fei Zheng Jose Santos Karl Schultz +Nicholas Curtis \ No newline at end of file diff --git a/README.md b/README.md index 8e10e1c1c..1465cf7a0 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,11 @@ contribution process. * Licensing information can be found in the [LICENSE](LICENSE) file. +## Examples +A set of guided exercises demonstrating kernel optimization using Omniperf can be found in the [amd/HPCTrainingExamples](https://github.com/amd/HPCTrainingExamples/tree/main/OmniperfExamples) repo. + + + ## Development Omniperf follows a diff --git a/sample/common.h b/sample/common.h new file mode 100644 index 000000000..b6edfeab0 --- /dev/null +++ b/sample/common.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + +#define hipCheck(stmt) \ + do { \ + hipError_t err = stmt; \ + if (err != hipSuccess) { \ + char msg[256]; \ + sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, \ + __FUNCTION__, __LINE__); \ + std::string errstring = hipGetErrorString(err); \ + std::cerr << msg << "\t" << errstring << std::endl; \ + throw std::runtime_error(msg); \ + } \ + } while (0) diff --git a/sample/fabric.hip b/sample/fabric.hip new file mode 100644 index 000000000..2c1f6b5ff --- /dev/null +++ b/sample/fabric.hip @@ -0,0 +1,315 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +A data-fabric exerciser example, written by Nicholas Curtis [AMD] + +The test allows the user to control the: + - The granularity of an allocation (Coarse vs Fine-grained), + - The owner of an allocation (local HBM, CPU DRAM or remote HBM), + - The size of an allocation (the default is ~4GiB), and + - The type of operation we are executing (read, write, atomics of various flavors) + +This lets the user explore the impact of these choices on the generated +data-fabric traffic. 
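For illustration, a hypothetical build-and-run sequence (the numeric flag values
map onto the enums defined below: --type 1 = CoarseGrained, --owner 0 = local
device HBM, --op 0 = Read, --data 0 = Unsigned):

    hipcc fabric.hip -o fabric
    ./fabric --type 1 --owner 0 --op 0 --data 0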
+*/ + + +#include +#include + +#include +#include + +#include "common.h" + +enum class mtype : int { FineGrained = 0, CoarseGrained = 1, Undef = 3 }; +enum class mowner : int { Device = 0, Host = 1, Remote = 2, Undef = 3 }; +enum class mspace : int { Global = 0, Undef = 1 }; +enum class mop : int { + Read = 0, + Write = 1, + AtomicAdd = 2, + AtomicCas = 3, + AtomicOr = 4, + AtomicMax = 5, + Undef = 6 +}; +enum class mdata : int { Unsigned = 0, UnsignedLong = 1, Float = 2, Double = 3, Undef = 4 }; + +template +T parse(const char* value) { + int ivalue = std::atoi(value); + if (ivalue < 0 || ivalue >= int(T::Undef)) { + throw std::runtime_error("bad enum value!"); + } + return T(ivalue); +} + +void parse(int argc, char** argv, mtype& mytype, mowner& myowner, + mspace& myspace, size_t& size, mop& myop, mdata& mydata, + int& remoteId) { + while (1) { + static struct option long_options[] = { + /* These options set a flag. */ + {"type", required_argument, 0, 't'}, + {"owner", required_argument, 0, 'o'}, + {"size", required_argument, 0, 'z'}, + {"op", required_argument, 0, 'p'}, + {"remote", required_argument, 0, 'r'}, + {"data", required_argument, 0, 'd'}, + {0, 0, 0, 0}}; + /* getopt_long stores the option index here. */ + int option_index = 0; + + int c = + getopt_long(argc, argv, "t:o:z:p:r:d:", long_options, &option_index); + + /* Detect the end of the options. */ + if (c == -1) break; + + switch (c) { + case 't': + mytype = parse(optarg); + break; + + case 'o': + myowner = parse(optarg); + break; + + case 'z': + size = std::atoll(optarg); + break; + + case 'p': + myop = parse(optarg); + break; + + case 'r': + remoteId = std::atoi(optarg); + break; + + case 'd': + mydata = parse(optarg); + break; + + case '?': + /* getopt_long already printed an error message. */ + break; + + default: + abort(); + } + } + std::cout << "Using: " << std::endl; + std::cout << "\tmtype:" + << ((mytype == mtype::FineGrained) ? "FineGrained" + : "CoarseGrained") + << std::endl; + std::cout << "\tmowner:" + << ((myowner == mowner::Device) + ? "Device" + : ((myowner == mowner::Host) ? "Host" : "Remote")) + << std::endl; + std::cout << "\tmspace:Global" << std::endl; + std::cout << "\tmop:" << ((myop == mop::Read) ? "Read" : (myop == mop::Write ? "Write" : (myop == mop::AtomicAdd ? "Add" : (myop == mop::AtomicCas ? "CAS" : (myop == mop::AtomicOr ? "Or" : "Max"))))) << std::endl; + std::cout << "\tmdata:" << (mydata == mdata::Unsigned ? "Unsigned" : (mydata == mdata::UnsignedLong ? "Unsigned Long" : (mydata == mdata::Float ? 
"Float" : "Double"))) << std::endl; + std::cout << "\tremoteId:" << remoteId << std::endl; +} + +// dummy intialization kernel +__global__ void init() {} + +template +void alloc(mtype memory, mowner owner, T** ptr, size_t Nbytes, int devId, + int remoteId) { + bool is_device = (owner == mowner::Device) || (owner == mowner::Remote); + if (owner == mowner::Remote) { + // enable remote access + hipCheck(hipDeviceEnablePeerAccess(remoteId, 0)); + // set id for alloc + hipCheck(hipSetDevice(remoteId)); + } + init<<<1, 1>>>(); + + if (memory == mtype::FineGrained && is_device) { + hipCheck( + hipExtMallocWithFlags((void**)ptr, Nbytes, hipDeviceMallocFinegrained)); + } else if (memory == mtype::CoarseGrained && is_device) { + hipCheck(hipMalloc(ptr, Nbytes)); + } else if (memory == mtype::FineGrained && owner == mowner::Host) { + hipCheck(hipHostMalloc(ptr, Nbytes, hipHostMallocCoherent)); + } else if (memory == mtype::CoarseGrained && owner == mowner::Host) { + hipCheck(hipHostMalloc(ptr, Nbytes, hipHostMallocNonCoherent)); + } else { + assert(false && "unknown combo"); + } + + // set to random + std::vector host(Nbytes / sizeof(T), T(0)); + hipCheck(hipMemcpy(*ptr, &host[0], Nbytes, + (is_device ? hipMemcpyHostToDevice : hipMemcpyHostToHost))); + + if (owner == mowner::Remote) { + // reset id for execution + hipCheck(hipSetDevice(devId)); + } +} + +template +void release(mtype memory, mowner owner, T* ptr) { + bool is_device = (owner == mowner::Device) || (owner == mowner::Remote); + if (memory == mtype::FineGrained && is_device) { + hipCheck(hipFree(ptr)); + } else if (memory == mtype::CoarseGrained && is_device) { + hipCheck(hipFree(ptr)); + } else if (memory == mtype::FineGrained && owner == mowner::Host) { + hipCheck(hipHostFree(ptr)); + } else if (memory == mtype::CoarseGrained && owner == mowner::Host) { + hipCheck(hipHostFree(ptr)); + } else { + assert(false && "unknown combo"); + } +} + +// the main streaming kernel +template +__global__ void kernel(T* x, size_t N, T zero, T foo) { + int sum = 0; + const size_t offset_start = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = 0; i < repeats; ++i) { + for (size_t offset = offset_start; offset < N; + offset += blockDim.x * gridDim.x) { + T uniq = (foo + offset) + i; + if constexpr (op == mop::Read) { + sum += x[offset]; + } else if constexpr (op == mop::Write) { + x[offset] = (T)offset; + } else if constexpr (op == mop::AtomicAdd) { + atomicAdd(&x[offset], uniq); + } else if constexpr (op == mop::AtomicCas) { + atomicCAS(&x[offset], uniq, uniq); + } else if constexpr (op == mop::AtomicOr) { + atomicOr(&x[offset], uniq); + } else if constexpr (op == mop::AtomicMax) { + atomicMax(&x[offset], uniq); + } + } + } + if constexpr (op == mop::Read) { + if (sum != 0) { + x[offset_start] = sum; + } + } +} + +template +void run_kernel(T* x, size_t size) { + if constexpr (op == mop::AtomicOr && std::is_floating_point_v) { + throw std::runtime_error("bad"); + } else { + kernel<<<4096, 1024>>>(x, size, 0, T(23456789)); + // then run once for data collection + kernel<<<4096, 1024>>>(x, size, 0, T(23456789)); + } +} + +template +void run_atomic(mowner myowner, T* x, size_t size) { + if (myowner == mowner::Host) { + // speed it up + run_kernel(x, size / 10); + } else { + run_kernel(x, size); + } +} + +template +void run(mtype mytype, mspace myspace, mowner myowner, mop myop, int remoteId, + size_t size) { + int devId = 0; + if (myowner == mowner::Remote && remoteId == -1) { + // need to find a remote GPU + int ndevices; + 
hipCheck(hipGetDeviceCount(&ndevices)); + if (ndevices <= 1) { + throw std::runtime_error( + "Need >=2 devices available for mowner = Remote"); + } + for (int i = 0; i < ndevices; ++i) { + if (i != devId) { + remoteId = i; + break; + } + } + } + + T* x; + alloc(mytype, myowner, &x, size * sizeof(T), devId, remoteId); + + // run the kernel once for warmup + assert(4096 * 1024 < size); + if (myop == mop::Read) { + run_kernel(x, size); + } else if (myop == mop::Write) { + run_kernel(x, size); + } else if (myop == mop::AtomicAdd) { + run_atomic(myowner, x, size); + } else if (myop == mop::AtomicCas) { + run_atomic(myowner, x, size); + } else if (myop == mop::AtomicOr) { + run_atomic(myowner, x, size); + } else if (myop == mop::AtomicMax) { + run_atomic(myowner, x, size); + } else { + throw std::runtime_error("bad"); + } + hipCheck(hipDeviceSynchronize()); + release(mytype, myowner, x); +} + +int main(int argc, char** argv) { + mtype mytype = (mtype)0; + mspace myspace = (mspace)0; + mowner myowner = (mowner)0; + mop myop = (mop)0; + mdata mydata = (mdata)0; + int remoteId = -1; + size_t size = 1024ull * 1024ull * + 1024ull; // 4 GiB, purposefully much larger than caches. + parse(argc, argv, mytype, myowner, myspace, size, myop, mydata, remoteId); + if (mydata == mdata::Unsigned) + run(mytype, myspace, myowner, myop, remoteId, size); + else if (mydata == mdata::UnsignedLong) + run(mytype, myspace, myowner, myop, remoteId, size); + else if (mydata == mdata::Float) + run(mytype, myspace, myowner, myop, remoteId, size); + else if (mydata == mdata::Double) + run(mytype, myspace, myowner, myop, remoteId, size); + else { + throw std::runtime_error("bad"); + } +} \ No newline at end of file diff --git a/sample/instmix.hip b/sample/instmix.hip new file mode 100644 index 000000000..a409db4b0 --- /dev/null +++ b/sample/instmix.hip @@ -0,0 +1,113 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +A instruction mix exerciser example, written by Gina Sitaraman and Nicholas Curtis [AMD]. +Although inline assembly is inherently unportable, this is expected to work on all CDNA accelerators. 
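As a sketch of how this kernel might be driven under Omniperf (the workload name
here is arbitrary, and the analyze invocation mirrors the vcopy examples used in
the project documentation):

    hipcc instmix.hip -o instmix
    omniperf profile -n instmix -- ./instmix
    omniperf analyze -p workloads/instmix/mi200/ --list-metrics gfx90a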
+*/ + + +#include "common.h" + +__global__ void kernelasm() { + // int32 + int i, j; + asm volatile("v_add_u32_e32 %0, %1, %0\n" : "=v"(j) : "v"(i)); + + // int 64 + long int l1, l2; + asm volatile("v_cmp_eq_i64 %0, %1\n" : "=v"(l2) : "v"(l1), "v"(i)); + + // fp32: add, mul, transcendental and fma + float f1, f2; + asm volatile( + "v_add_f32_e32 %0, %1, %0\n" + "v_mul_f32_e32 %0, %1, %0\n" + "v_sqrt_f32 %0, %1\n" + "v_fma_f32 %0, %1, %0, %1\n" + : "=v"(f1) + : "v"(f2)); + + // fp64: add, mul, transcendental and fma + double d1, d2, d3, d4; + asm volatile( + "v_add_f64 %0, %1, %0\n" + "v_mul_f64 %0, %1, %0\n" + "v_fma_f64 %0, %1, %0, %1\n" + "v_sqrt_f64 %0, %1\n" + "v_min_f64 %0, %1, %0\n" + : "+v"(d1) + : "v"(d2)); + + // fp16: add, mul, transcendental and fma + _Float16 h1, h2; + asm volatile( + "v_add_f16_e32 %0, %1, %0\n" + "v_mul_f16_e32 %0, %1, %0\n" + "v_sqrt_f16 %0, %1\n" + "v_cvt_f16_f32 %0 %2\n" + "v_fma_f16 %0, %1, %0, %0\n" + : "=v"(h2) + : "v"(h1), "v"(f1)); + + // MFMA ops + double2 dd; + unsigned short us; + long2 ll; +#if defined(__gfx90a__) + asm volatile("v_mfma_f64_4x4x4f64 %0 %1 %2 %3\n" + : "=v"(d4) + : "v"(d1), "v"(d2), "v"(d3)); + asm volatile("v_mfma_f32_16x16x4f32 %0 %1 %2 1\n" + : "=v"(dd) + : "v"(f1), "v"(f2)); + asm volatile("v_mfma_f32_16x16x16f16 %0 %1 %2 1\n" + : "=v"(dd) + : "v"(d1), "v"(d2)); + asm volatile("v_mfma_f32_16x16x8bf16 %0 %1 %2 1\n" + : "=v"(dd) + : "v"(f1), "v"(f2)); + asm volatile("v_mfma_i32_16x16x16i8 %0 %1 %2 1\n" + : "=v"(ll) + : "v"(i), "v"(j)); +#endif + + // Scalar op + asm volatile("s_add_i32 %0 %1 %0\n" : "=s"(j) : "s"(i)); + + // LDS + asm volatile("ds_read_b32 %0 %0\n" : "=v"(i) : "v"(j)); + + // Branch + asm volatile( + "s_branch .LDUMMY\n" + ".LDUMMY:\n" + "s_endpgm\n"); +} +int main() { + kernelasm<<<1, 64>>>(); + hipCheck(hipDeviceSynchronize()); +} diff --git a/sample/ipc.hip b/sample/ipc.hip new file mode 100644 index 000000000..9fcdf462d --- /dev/null +++ b/sample/ipc.hip @@ -0,0 +1,127 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +An example to explore IPC and divergence, written by Nicholas Curtis [AMD]. +This example may not work on all CDNA accelerators, but has been verified on MI2XX. 
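As a rough worked example of the divergence case below, assuming a 64-wide
wavefront (as on CDNA accelerators): a 1024-thread block is 16 wavefronts, and
the predicate threadIdx.x % 64 == 0 leaves exactly one of the 64 lanes in each
wavefront active. The wavefront still issues and executes the full v_mov
instruction stream with the other lanes masked off, which is what makes the
divergent variant interesting to compare against the uniform vmov kernel.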
+*/ + +#include "common.h" + +template +__device__ void vrcp_op() { + int dummy; + if constexpr (N >= 1) { + asm volatile("v_rcp_f64 v[0:1], v[0:1]\n" : : "{v31}"(dummy)); + vrcp_op(); + } +} + +template +__global__ void vrcp() { + vrcp_op(); +} + +template +__device__ void vmov_op() { + int dummy; + if constexpr (N >= 1) { + asm volatile("v_mov_b32 v0, v1\n" : : "{v31}"(dummy)); + vmov_op(); + } +} + +template +__global__ void vmov() { + vmov_op(); +} + +template +__device__ void mfma_op() { + int dummy; + if constexpr (N >= 1) { + asm volatile("v_mfma_f32_32x32x8bf16_1k v[0:15], v[16:17], v[18:19], v[0:15]\n" : : "{v31}"(dummy)); + mfma_op(); + } +} + +template +__global__ void mfma() { + mfma_op(); +} + +template +__device__ void snop_op() { + int dummy; + if constexpr (N >= 1) { + asm volatile("s_nop 0x0\n" : : "{v31}"(dummy)); + snop_op(); + } +} + + +template +__global__ void snop() { + snop_op(); +} + +template +__device__ void smov_op() { + int dummy; + if constexpr (N >= 1) { + asm volatile("s_mov_b32 s0, s1\n" : : "{s31}"(dummy)); + smov_op(); + } +} + +template +__global__ void smov() { + smov_op(); +} + +template +__global__ void vmov_with_divergence() { + if (threadIdx.x % 64 == 0) + vmov_op(); +} + +int main() { + // warmups, spam to all CUs + vrcp<<<1024 * 1024, 1024>>>(); + vmov<<<1024 * 1024, 1024>>>(); + mfma<<<1024 * 1024, 1024>>>(); + snop<<<1024 * 1024, 1024>>>(); + smov<<<1024 * 1024, 1024>>>(); + vmov_with_divergence<<<1024 * 1024, 1024>>>(); + hipCheck(hipDeviceSynchronize()); + vrcp<<<1024 * 1024, 1024>>>(); + vmov<<<1024 * 1024, 1024>>>(); + mfma<<<1024 * 1024, 1024>>>(); + snop<<<1024 * 1024, 1024>>>(); + smov<<<1024 * 1024, 1024>>>(); + vmov_with_divergence<<<1024 * 1024, 1024>>>(); + hipCheck(hipDeviceSynchronize()); +} \ No newline at end of file diff --git a/sample/lds.hip b/sample/lds.hip new file mode 100644 index 000000000..2018ad8da --- /dev/null +++ b/sample/lds.hip @@ -0,0 +1,78 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +An example to explore LDS bandwidth and bank conflicts, written by Nicholas Curtis [AMD]. 
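A short worked example of the two access patterns below, assuming an LDS built
from 32 four-byte banks (as on current CDNA accelerators): in the load kernel
each thread reads array[threadIdx.x], so consecutive lanes touch consecutive
words and hence distinct banks. In the conflicts kernel each thread reads
array[threadIdx.x * 32]; because the bank is (word index) % 32, every lane in a
wavefront maps to the same bank and its accesses should be serialized, which is
what the bank-conflict counters are expected to expose.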
+*/ + + +#include "common.h" + +constexpr unsigned max_threads = 256; +constexpr unsigned nbanks = 32; + +__global__ void load(int* out, int flag) { + __shared__ int array[max_threads]; + int index = threadIdx.x; + // fake a store to the LDS array to avoid unwanted behavior + if (flag) + array[max_threads - index] = index; + __syncthreads(); + int x = array[index]; + if (x == int(-1234567)) + out[threadIdx.x] = x; +} + +__global__ void conflicts(int* out, int flag) { + constexpr unsigned nelements = nbanks * max_threads; + __shared__ int array[nelements]; + // each thread reads from the same bank + int index = threadIdx.x * nbanks; + // fake a store to the LDS array to avoid unwanted behavior + if (flag) + array[max_threads - index] = index; + __syncthreads(); + int x = array[index]; + if (x == int(-1234567)) + out[threadIdx.x] = x; +} + +void bandwidth_demo(int N) { + for (int i = 1; i <= N; ++i) + load<<<1,i>>>(nullptr, 0); + hipCheck(hipDeviceSynchronize()); +} + +void conflicts_demo(int N) { + for (int i = 1; i <= N; ++i) + conflicts<<<1,i>>>(nullptr, 0); + hipCheck(hipDeviceSynchronize()); +} + +int main() { + bandwidth_demo(max_threads); + conflicts_demo(max_threads); +} diff --git a/sample/occupancy.hip b/sample/occupancy.hip new file mode 100644 index 000000000..7c7099e30 --- /dev/null +++ b/sample/occupancy.hip @@ -0,0 +1,109 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +An example to explore achieved occupancy, and various occupancy limiters. +Written by Nicholas Curtis [AMD]. 
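As a rough worked example of one limiter below, assuming 64 KiB of LDS per
compute unit (as on MI200-class accelerators): the ldsbound kernel statically
allocates 64 KiB / sizeof(double) = 8192 doubles of LDS, i.e. the entire 64 KiB,
per workgroup, so at most one 256-thread workgroup (4 wavefronts) can be
resident on a CU at a time, far below the hardware's per-CU wavefront limit,
which is what caps the achieved occupancy.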
+*/ + + +#include "common.h" + +__global__ void empty(int N, double* ptr) { + +} + +constexpr int bound = 16; +__launch_bounds__(256) +__global__ void vgprbound(int N, double* ptr) { + double intermediates[bound]; + for (int i = 0 ; i < bound; ++i) intermediates[i] = N * threadIdx.x; + double x = ptr[threadIdx.x]; + for (int i = 0; i < 100; ++i) { + x += sin(pow(__shfl(x, i % warpSize) * intermediates[(i - 1) % bound], intermediates[i % bound])); + intermediates[i % bound] = x; + } + if (x == N) ptr[threadIdx.x] = x; +} + +constexpr size_t fully_allocate_lds = 64ul * 1024ul / sizeof(double); +__launch_bounds__(256) +__global__ void ldsbound(int N, double* ptr) { + __shared__ double intermediates[fully_allocate_lds]; + for (int i = threadIdx.x ; i < fully_allocate_lds; i += blockDim.x) intermediates[i] = N * threadIdx.x; + __syncthreads(); + double x = ptr[threadIdx.x]; + for (int i = threadIdx.x; i < fully_allocate_lds; i += blockDim.x) { + x += sin(pow(__shfl(x, i % warpSize) * intermediates[(i - 1) % fully_allocate_lds], intermediates[i % fully_allocate_lds])); + __syncthreads(); + intermediates[i % fully_allocate_lds] = x; + } + if (x == N) ptr[threadIdx.x] = x; +} + +constexpr int sgprlim = 1; +__launch_bounds__(1024, 8) +__global__ void sgprbound(int N, double* ptr) { + double intermediates[sgprlim]; + for (int i = 0 ; i < sgprlim; ++i) intermediates[i] = i; + double x = ptr[0]; + #pragma unroll 1 + for (int i = 0; i < 100; ++i) { + x += sin(pow(intermediates[(i - 1) % sgprlim], intermediates[i % sgprlim])); + intermediates[i % sgprlim] = x; + } + if (x == N) ptr[0] = x; +} + +int main() { + double* ptr; + hipCheck(hipMalloc(&ptr, 1024 * sizeof(double))); + vgprbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + vgprbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + ldsbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + ldsbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + sgprbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + sgprbound<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + empty<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + empty<<<1024 * 1024, 256>>>(0, ptr); + hipCheck(hipGetLastError()); + hipCheck(hipDeviceSynchronize()); + hipCheck(hipFree(ptr)); +} diff --git a/sample/stack.hip b/sample/stack.hip new file mode 100644 index 000000000..9f030309a --- /dev/null +++ b/sample/stack.hip @@ -0,0 +1,43 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +An example to explore spill/stack instructions. +Written by Nicholas Curtis [AMD]. +*/ + +#include "common.h" + +__global__ void knl(int* out, int filter) { + int x[1024]; + x[filter] = 0; + if (threadIdx.x < filter) out[threadIdx.x] = x[threadIdx.x]; +} + +int main() { + knl<<<1, 1>>>(nullptr, 0); + hipCheck(hipDeviceSynchronize()); +} \ No newline at end of file diff --git a/sample/vcopy.cpp b/sample/vcopy.cpp index 0eed48711..88fdff22e 100644 --- a/sample/vcopy.cpp +++ b/sample/vcopy.cpp @@ -1,3 +1,29 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el +*/ + #include "hip/hip_runtime.h" #include #include diff --git a/sample/vmem.hip b/sample/vmem.hip new file mode 100644 index 000000000..e85d1baa5 --- /dev/null +++ b/sample/vmem.hip @@ -0,0 +1,98 @@ +/* +##############################################################################bl +# MIT License +# +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +##############################################################################el + + + +An example to explore global/generic instructions. +Written by Nicholas Curtis [AMD]. +*/ + +#include "common.h" + +typedef int __attribute__((address_space(0)))* generic_ptr; + +__attribute__((noinline)) __device__ void generic_store(generic_ptr ptr, int zero) { *ptr = zero; } +__attribute__((noinline)) __device__ int generic_load(generic_ptr ptr) { return *ptr; } +__attribute__((noinline)) __device__ void generic_atomic(generic_ptr ptr, int zero) { atomicAdd((int*)ptr, zero); } + +__global__ void global_write(int* ptr, int zero) { + ptr[threadIdx.x] = zero; +} + +__global__ void generic_write(int* ptr, int zero, int filter) { + __shared__ int lds[1024]; + int* generic = (threadIdx.x < filter) ? &ptr[threadIdx.x] : &lds[threadIdx.x]; + generic_store((generic_ptr)generic, zero); +} + +__global__ void global_read(int* ptr, int zero) { + int x = ptr[threadIdx.x]; + if (x != zero) { + ptr[threadIdx.x] = x + 1; + } +} + +__global__ void generic_read(int* ptr, int zero, int filter) { + __shared__ int lds[1024]; + if (static_cast(filter - 1) == zero) { + lds[threadIdx.x] = 0; // initialize to zero to avoid conditional, but hide behind _another_ conditional + } + int* generic; + if (static_cast(threadIdx.x) > filter - 1) { + generic = &ptr[threadIdx.x]; + } else { + generic = &lds[threadIdx.x]; + abort(); + } + int x = generic_load((generic_ptr)generic); + if (x != zero) { + ptr[threadIdx.x] = x + 1; + } +} + + +__global__ void global_atomic(int* ptr, int zero) { + atomicAdd(ptr, zero); +} + +__global__ void generic_atomic(int* ptr, int filter, int zero) { + __shared__ int lds[1024]; + int* generic = (threadIdx.x % 2 == filter) ? &ptr[threadIdx.x] : &lds[threadIdx.x]; + generic_atomic((generic_ptr)generic, zero); +} + +int main() { + int* ptr; + hipCheck(hipMalloc(&ptr, sizeof(int))); + hipCheck(hipMemset(ptr, 0, sizeof(int))); + global_write<<<1,1>>>(ptr, 0); + generic_write<<<1,1>>>(ptr, 0, 0); + global_read<<<1,1>>>(ptr, 0); + generic_read<<<1,1>>>(ptr, 0, 0); + global_atomic<<<1,1>>>(ptr, 0); + generic_atomic<<<1,1>>>(ptr, 0, 0); + hipCheck(hipDeviceSynchronize()); + hipCheck(hipFree(ptr)); +} diff --git a/src/docs/analysis.md b/src/docs/analysis.md index 9b68249c4..1997563c3 100644 --- a/src/docs/analysis.md +++ b/src/docs/analysis.md @@ -5,7 +5,7 @@ :glob: :maxdepth: 4 ``` -Omniperf offers several ways to interact with the metrics it generates from profiling. The option you choose will likey be influnced by your familiarity with the profiled application, computing enviroment, and experience with Omniperf. +Omniperf offers several ways to interact with the metrics it generates from profiling. The option you choose will likely be influnced by your familiarity with the profiled application, computing enviroment, and experience with Omniperf. While analyzing with the CLI offers quick and straightforward access to Omniperf metrics from terminal, the GUI adds an extra layer of styling and interactiveness some users may prefer. 
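For example, the same profiled workload can be inspected either way (the paths
below follow the vcopy example used throughout this page; the second command adds
the `--gui` flag to serve the lightweight standalone GUI instead of printing to
the terminal):

```shell-session
$ omniperf analyze -p workloads/vcopy/mi200/
$ omniperf analyze -p workloads/vcopy/mi200/ --gui
```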
@@ -16,7 +16,7 @@ See sections below for more information on each. ### Features -- All Omniperf built-in metrics. +- All of Omniperf's built-in metrics. - Multiple runs base line comparison. - Metrics customization: pick up subset of build-in metrics or build your own profiling configuration. - Kernel, gpu-id, dispatch-id filters. @@ -107,7 +107,7 @@ Analyze 2. System Speed-of-Light .... ``` - 2. Use `--list-metrics` to generate a list of availible metrics for inspection + 2. Use `--list-metrics` to generate a list of available metrics for inspection ```shell-session $ omniperf analyze -p workloads/vcopy/mi200/ --list-metrics gfx90a ╒═════════╤═════════════════════════════╕ @@ -254,7 +254,7 @@ Analyze ``` > **Note:** Some cells may be blank indicating a missing/unavailable hardware counter or NULL value -3. Optimizatize application, iterate, and re-profile to inspect performance changes. +3. Optimize application, iterate, and re-profile to inspect performance changes. 4. Redo a comprehensive analysis with Omniperf CLI at any milestone or at the end. ### Demo @@ -281,7 +281,7 @@ Analyze $ omniperf analyze -p workloads/vcopy/mi200/ -b 2 5.1.0 ``` - > Note: Users can filter single metric or the whole IP block by its id. In this case, 1 is the id for "system speed of light" and 5.1.0 the id for metric "GPU Busy Cycles". + > Note: Users can filter single metric or the whole hardware component by its id. In this case, 1 is the id for "system speed of light" and 5.1.0 the id for metric "GPU Busy Cycles". - Filter kernels @@ -304,7 +304,7 @@ Analyze ``` - Second, select the index of the kernel you'd like to filter (i.e. __vecCopy(double*, double*, double*, int, int) [clone .kd]__ at index __0__). Then, use this index to apply the filter via `-k/--kernels`. + Second, select the index of the kernel you would like to filter (i.e. __vecCopy(double*, double*, double*, int, int) [clone .kd]__ at index __0__). Then, use this index to apply the filter via `-k/--kernels`. ```shell-session $ omniperf -p workloads/vcopy/mi200/ -k 0 @@ -325,7 +325,7 @@ Analyze ... ... ``` - > Note: You'll see your filtered kernel(s) indicated by a asterisk in the Top Stats table + > Note: You will see your filtered kernel(s) indicated by an asterisk in the Top Stats table - Baseline comparison @@ -333,7 +333,7 @@ Analyze ```shell omniperf analyze -p workload1/path/ -p workload2/path/ ``` - > Note: You can also apply diffrent filters to each workload. + > Note: You can also apply different filters to each workload. OR ```shell @@ -414,7 +414,7 @@ When no filters are applied, users will see five basic sections derived from the To dive deeper, use the top drop down menus to isolate particular kernel(s) or dispatch(s). You will then see the web page update with -metrics specific to the filter you've applied. +metrics specific to the filter you have applied. Once you have applied a filter, you will also see several additional sections become available with detailed metrics specific to that area @@ -427,22 +427,22 @@ interface](https://amdresearch.github.io/omniperf/analysis.html#grafana-based-gu #### Features The Omniperf Grafana GUI Analyzer supports the following features to facilitate MI GPU performance profiling and analysis: -- System and IP-Block Speed-of-Light (SOL) +- System and Hardware Component (IP Block) Speed-of-Light (SOL) - Multiple normalization options, including per-cycle, per-wave, per-kernel and per-second. 
- Baseline comparisons - Regex based Dispatch ID filtering - Roofline Analysis -- Detailed per IP Block performance counters and metrics - - CPC/CPF - - SPI - - SQ - - SQC - - TA/TD - - TCP - - TCC (both aggregated and per-channel perf info) +- Detailed performance counters and metrics per hardware component, e.g., + - Command Processor - Fetch (CPF) / Command Processor - Controller (CPC) + - Workgroup Manager (SPI) + - Shader Sequencer (SQ) + - Shader Sequencer Controller (SQC) + - L1 Address Processing Unit, a.k.a. Texture Addresser (TA) / L1 Backend Data Processing Unit, a.k.a. Texture Data (TD) + - L1 Cache (TCP) + - L2 Cache (TCC) (both aggregated and per-channel perf info) ##### Speed-of-Light -Speed-of-light panels are provided at both the system and per IP block level to help diagnosis performance bottlenecks. The performance numbers of the workload under testing are compared to the theoretical maximum, (e.g. floating point operations, bandwidth, cache hit rate, etc.), to indicate the available room to further utilize the hardware capability. +Speed-of-light panels are provided at both the system and per hardware component level to help diagnosis performance bottlenecks. The performance numbers of the workload under testing are compared to the theoretical maximum, (e.g. floating point operations, bandwidth, cache hit rate, etc.), to indicate the available room to further utilize the hardware capability. ##### Multi Normalization @@ -457,24 +457,24 @@ Omniperf enables baseline comparison to allow checking A/B effect. The current r For both the Current Workload and the Baseline Workload, one can independently setup the following filters to allow fine grained comparions: - Workload Name -- GPU ID filtering (multi selection) -- Kernel Name filtering (multi selection) +- GPU ID filtering (multi-selection) +- Kernel Name filtering (multi-selection) - Dispatch ID filtering (Regex filtering) -- Omniperf Panels (multi selection) +- Omniperf Panels (multi-selection) ##### Regex based Dispatch ID filtering -This release enables regex based dispatch ID filtering to flexibly choose the kernel invocations. One may refer to [Regex Numeric Range Generator](https://3widgets.com/), to generate typical number ranges. +This release enables Regular Expression (regex), a standard Linux string matching syntax, based dispatch ID filtering to flexibly choose the kernel invocations. One may refer to [Regex Numeric Range Generator](https://3widgets.com/), to generate typical number ranges. -For example, if one wants to inspect Dispatch Range from 17 to 48, inclusive, the corresponding regex is : **(1[7-9]|[23]\d|4[0-8])**. The generated express can be copied over for filtering. +For example, if one wants to inspect Dispatch Range from 17 to 48, inclusive, the corresponding regex is : **(1[7-9]|[23]\d|4[0-8])**. The generated expression can be copied over for filtering. ##### Incremental Profiling Omniperf supports incremental profiling to significantly speed up performance analysis. -> Refer to [*IP Block profiling*](https://amdresearch.github.io/omniperf/profiling.html#ip-block-profiling) section for this command. +> Refer to [*Hardware Component Filtering*](https://amdresearch.github.io/omniperf/profiling.html#hardware-component-filtering) section for this command. -By default, the entire application is profiled to collect perfmon counter for all IP blocks, giving a system level view of where the workload stands in terms of performance optimization opportunities and bottlenecks. 
+By default, the entire application is profiled to collect performance counters for all hardware blocks, giving a complete view of where the workload stands in terms of performance optimization opportunities and bottlenecks. -After that one may focus on only a few IP blocks, (e.g., L1 Cache or LDS) to closely check the effect of software optimizations, without performing application replay for all other IP Blocks. This saves lots of compute time. In addition, the prior profiling results for other IP blocks are not overwritten. Instead, they can be merged during the import to piece together the system view. +After that one may focus on only a few hardware components, (e.g., L1 Cache or LDS) to closely check the effect of software optimizations, without performing application replay for all other hardware components. This saves lots of compute time. In addition, the prior profiling results for other hardware components are not overwritten. Instead, they can be merged during the import to piece together the system view. ##### Color Coding The uniform color coding is applied to most visualizations (bars, table, diagrams etc). Typically, Yellow color means over 50%, while Red color mean over 90% percent, for easy inspection. @@ -594,7 +594,7 @@ There are currently 18 main panel categories available for analyzing the compute - Command Processor - Command Processor - Fetch (CPF) - Command Processor - Controller (CPC) -- Shader Processing Input (SPI) +- Workgroup Manager or Shader Processor Input (SPI) - SPI Stats - SPI Resource Allocations - Wavefront Launch @@ -655,116 +655,357 @@ There are currently 18 main panel categories available for analyzing the compute - Per-channel L2-EA Read stall (I/O, GMI, HBM) - Per-channel L2-EA Write stall (I/O, GMI, HBM, Starve) -Most panels are designed around a specific IP block to thoroughly understand its behavior. Additional panels, including custom panels, could also be added to aid the performance analysis. +Most panels are designed around a specific hardware component block to thoroughly understand its behavior. Additional panels, including custom panels, could also be added to aid the performance analysis. ##### System Info Panel -![System Info Panel](images/System_info_panel.png) +``` {figure} images/system-info_panel.png +:alt: System Info +:figclass: figure +:align: center + +System details logged from host machine. +``` + ##### Kernel Statistics ###### Kernel Time Histogram -![Kernel Time Histogram](images/Kernel_time_histogram.png) +``` {figure} images/Kernel_time_histogram.png +:alt: Kernel Time Histogram +:figclass: figure +:align: center + +Mapping application kernel launches to execution duration. +``` ###### Top Bottleneck Kernels -![Top Bottleneck Kernels](images/Top_bottleneck_kernels.png) +``` {figure} images/top-stat_panel.png +:alt: Top Bottleneck Kernels +:figclass: figure +:align: center + +Top N kernels and relevant statistics. Sorted by total duration. +``` ###### Top Bottleneck Dispatches -![Top Bottleneck Dispatches](images/Top_bottleneck_dispatches.png) +``` {figure} images/Top_bottleneck_dispatches.png +:alt: Top Bottleneck Dispatches +:figclass: figure +:align: center + +Top N kernel dispatches and relevant statistics. Sorted by total duration. 
+``` ###### Current and Baseline Dispatch IDs (Filtered) -![Current and Baseline Dispatch IDs](images/Current_and_baseline_dispatch_ids.png) +``` {figure} images/Current_and_baseline_dispatch_ids.png +:alt: Current and Baseline Dispatch IDs +:figclass: figure +:align: center + +List of all kernel dispatches. +``` ##### System Speed-of-Light -![System Speed-of-Light](images/System_speed_of_light.png) +``` {figure} images/sol_panel.png +:alt: System Speed-of-Light +:figclass: figure +:align: center + +Key metrics from various sections of Omniperf’s profiling report. +``` ##### Memory Chart Analysis > Note: The Memory Chart Analysis support multiple normalizations. Due to the space limit, all transactions, when normalized to per-sec, default to unit of Billion transactions per second. -![Memory Chart Analysis](images/Memory_chart_analysis.png) +``` {figure} images/memory-chart_panel.png +:alt: Memory Chart Analysis +:figclass: figure +:align: center + +A graphical representation of performance data for memory blocks on the GPU. +``` + +##### Empirical Roofline Analysis +``` {figure} images/roofline_panel.png +:alt: Roofline Analysis +:figclass: figure +:align: center + +Visualize achieved performance relative to a benchmarked peak performance. +``` -##### Roofline Analysis -![Roofline Analysis](images/Roofline_analysis.png) ##### Command Processor -![Command Processor](images/Command_processor.png) -##### Shader Processing Input (SPI) -![Shader Processing Input](images/Shader_processing_input.png) -##### Wavefront Launch -![Wavefront Launch](images/Wavefront_launch.png) +###### Command Processor Fetcher +``` {figure} images/cpc_panel.png +:alt: Command Processor Fetcher +:figclass: figure +:align: center + +Fetches commands out of memory to hand them over to the Command Processor Fetcher (CPC) for processing +``` +###### Command Processor Compute +``` {figure} images/cpf_panel.png +:alt: Command Processor Compute +:figclass: figure +:align: center + +The micro-controller running the command processing firmware that decodes the fetched commands, and (for kernels) passes them to the Workgroup Managers (SPIs) for scheduling. +``` + +##### Shader Processor Input (SPI) +###### SPI Stats +``` {figure} images/spi-stats_panel.png +:alt: SPI Stats +:figclass: figure +:align: center + +TODO: Add caption after merge +``` +###### SPI Resource Allocation +``` {figure} images/spi-resource-allocation_panel.png +:alt: SPI Resource Allocation +:figclass: figure +:align: center + +TODO: Add caption after merge +``` + +##### Wavefront +###### Wavefront Launch Stats +``` {figure} images/wavefront-launch-stats_panel.png +:alt: Wavefront Launch Stats +:figclass: figure +:align: center + +General information about the kernel launch. +``` +###### Wavefront Runtime Stats +``` {figure} images/wavefront-runtime-stats_panel.png +:alt: Wavefront Runtime Stats +:figclass: figure +:align: center + +High-level overview of the execution of wavefronts in a kernel. +``` ##### Compute Unit - Instruction Mix ###### Instruction Mix -![Instruction Mix](images/Instruction_mix.png) +``` {figure} images/cu-inst-mix_panel.png +:alt: Instruction Mix +:figclass: figure +:align: center + +Breakdown of the various types of instructions executed by the user’s kernel, and which pipelines on the Compute Unit (CU) they were executed on. 
+``` ###### VALU Arithmetic Instruction Mix -![VALU Arithmetic Instruction Mix](images/VALU_arithmetic_instruction_mix.png) +``` {figure} images/cu-value-arith-instr-mix_panel.png +:alt: VALU Arithmetic Instruction Mix +:figclass: figure +:align: center + +The various types of vector instructions that were issued to the vector arithmetic logic unit (VALU). +``` ###### MFMA Arithmetic Instruction Mix -![MFMA Arithmetic Instruction Mix](images/MFMA_arithmetic_instruction_mix.png) +``` {figure} images/cu-mafma-arith-instr-mix_panel.png +:alt: MFMA Arithmetic Instruction Mix +:figclass: figure +:align: center + +The types of Matrix Fused Multiply-Add (MFMA) instructions that were issued. +``` ###### VMEM Arithmetic Instruction Mix -![VMEM Arithmetic Instruction Mix](images/VMEM_arithmetic_intensity_mix.png) +``` {figure} images/cu-vmem-instr-mix_panel.png +:alt: VMEM Arithmetic Instruction Mix +:figclass: figure +:align: center + +The types of vector memory (VMEM) instructions that were issued. +``` ##### Compute Unit - Compute Pipeline ###### Speed-of-Light -![Speed-of-Light](images/Comp_pipe_sol.png) -###### Compute Pipeline Stats -![Compute Pipeline Stats](images/Compute_pipeline_stats.png) +``` {figure} images/cu-sol_panel.png +:alt: Speed-of-Light +:figclass: figure +:align: center + +The number of floating-point and integer operations executed on the vector arithmetic logic unit (VALU) and Matrix Fused Multiply-Add (MFMA) units in various precisions. +``` +###### Pipeline Stats +``` {figure} images/cu-pipeline-stats_panel.png +:alt: Pipeline Stats +:figclass: figure +:align: center + +More detailed metrics to analyze the several independent pipelines found in the Compute Unit (CU). +``` ###### Arithmetic Operations -![Arithmetic Operations](images/Arithmetic_operations.png) -###### Memory Latencies -![Memory Latencies](images/Memory_latencies.png) +``` {figure} images/cu-arith-ops_panel.png +:alt: Arithmetic Operations +:figclass: figure +:align: center + +The total number of floating-point and integer operations executed in various precisions. +``` ##### Local Data Share (LDS) ###### Speed-of-Light -![Speed-of-Light](images/LDS_sol.png) +``` {figure} images/lds-sol_panel.png +:alt: Speed-of-Light +:figclass: figure +:align: center + +Key metrics for the Local Data Share (LDS) as a comparison with the peak achievable values of those metrics. +``` ###### LDS Stats -![LDS Stats](images/LDS_stats.png) +``` {figure} images/lds-stats_panel.png +:alt: LDS Stats +:figclass: figure +:align: center + +More detailed view of the Local Data Share (LDS) performance. +``` ##### Instruction Cache ###### Speed-of-Light -![Speed-of-Light](images/Instruc_cache_sol.png) +``` {figure} images/instr-cache-sol_panel.png +:alt: Speed-of-Light +:figclass: figure +:align: center + +Key metrics of the L1 Instruction (L1I) cache as a comparison with the peak achievable values of those metrics. +``` ###### Instruction Cache Stats -![Instruction Cache Stats](images/Instruction_cache_stats.png) +``` {figure} images/instr-cache-accesses_panel.png +:alt: Instruction Cache Stats +:figclass: figure +:align: center + +More detail on the hit/miss statistics of the L1 Instruction (L1I) cache. 
+``` ##### Scalar L1D Cache ###### Speed-of-Light -![](images/L1D_sol.png) -###### Constant Cache Stats -![Constant Cache Stats](images/Vec_L1D_cache_accesses.png) -###### Constant Cache - L2 Interface -![Constant Cache - L2 Interface](images/Constant_cache_l2_interface.png) +``` {figure} images/sl1d-sol_panel.png +:alt: Speed-of-Light +:figclass: figure +:align: center + +Key metrics of the Scalar L1 Data (sL1D) cache as a comparison with the peak achievable values of those metrics. +``` +###### Scalar L1D Cache Accesses +``` {figure} images/sl1d-cache-accesses_panel.png +:alt: Scalar L1D Cache Accesses +:figclass: figure +:align: center + +More detail on the types of accesses made to the Scalar L1 Data (sL1D) cache, and the hit/miss statistics. +``` +###### Scalar L1D Cache - L2 Interface +``` {figure} images/sl1d-l12-interface_panel.png +:alt: Scalar L1D Cache - L2 Interface +:figclass: figure +:align: center + +More detail on the data requested across the Scalar L1 Data (sL1D) cache <-> L2 interface. +``` ##### Texture Address and Texture Data -###### Texture Address (TA) -![Texture Address](images/Texture_address.png) -###### Texture Data (TD) -![Texture Data](images/Texture_data.png) +###### Texture Addresser +``` {figure} images/ta_panel.png +:alt: Texture Addresser +:figclass: figure +:align: center + +Metric specific to texture addresser (TA) which receives commands (e.g., instructions) and write/atomic data from the Compute Unit (CU), and coalesces them into fewer requests for the cache to process. +``` +###### Texture Data +``` {figure} images/td_panel.png +:alt: Texture Data +:figclass: figure +:align: center + +Metrics specific to texture data (TD) which routes data back to the requesting Compute Unit (CU). +``` -##### Vector L1D Cache +##### Vector L1 Data Cache ###### Speed-of-Light -![Speed-of-Light](images/Vec_L1D_cache_sol.png) -###### Vector L1D Cache Accesses -![Vector L1D Cache Accesses](images/Vec_L1D_cache_accesses.png) -###### L1 Cache Stalls -![L1 Cache Stalls](images/L1_cache_stalls.png) -###### L1 - L2 Transactions -![L1 - L2 Transactions](images/L1_l2_transactions.png) -###### L1 - UTCL1 Interface Stats -![L1 - UTCL1 Interface Stats](images/L1_utcl1_transactions.png) +``` {figure} images/vl1d-sol_panel.png +:alt: Speed-of-Light +:figclass: figure +:align: center + +Key metrics of the vector L1 data (vL1D) cache as a comparison with the peak achievable values of those metrics. +``` +###### L1D Cache Stalls +``` {figure} images/vl1d-cache-stalls_panel.png +:alt: L1D Cache Stalls +:figclass: figure +:align: center + +More detail on where vector L1 data (vL1D) cache is stalled in the pipeline, which may indicate performance limiters of the cache. +``` +###### L1D Cache Accesses +``` {figure} images/vl1d-cache-accesses_panel.png +:alt: L1D Cache Accesses +:figclass: figure +:align: center + +The type of requests incoming from the cache frontend, the number of requests that were serviced by the vector L1 data (vL1D) cache, and the number & type of outgoing requests to the L2 cache. +``` +###### L1D - L2 Transactions +``` {figure} images/vl1d-l2-transactions_panel.png +:alt: L1D - L2 Transactions +:figclass: figure +:align: center + +A more granular look at the types of requests made to the L2 cache. 
+``` +###### L1D Addr Translation +``` {figure} images/vl1d-addr-translation_panel.png +:alt: L1D Addr Translation +:figclass: figure +:align: center + +After a vector memory instruction has been processed/coalesced by the address processing unit of the vector L1 data (vL1D) cache, it must be translated from a virtual to physical address. These metrics provide more details on the L1 Translation Lookaside Buffer (TLB) which handles this process. +``` ##### L2 Cache ###### Speed-of-Light -![Speed-of-Light](images/L2_cache_sol.png) +``` {figure} images/l2-sol_panel.png +:alt: Speed-of-Light +:figclass: figure +:align: center + +Key metrics about the performance of the L2 cache, aggregated over all the L2 channels, as a comparison with the peak achievable values of those metrics. +``` ###### L2 Cache Accesses -![L2 Cache Accesses](images/L2_cache_accesses.png) -###### L2 - EA Transactions -![L2 - EA Transactions](images/L2_ea_transactions.png) -###### L2 - EA Stalls -![L2 - EA Stalls](images/L2_ea_stalls.png) - -##### L2 Cache Per Channel Performance -###### L1-L2 Transactions -![L1-L2 Transactions](images/L1_l2_transactions_per_channel.png) -###### L2-EA Transactions -![L2-EA Transactions](images/L2_ea_transactions_per_channel.png) -###### L2-EA Latencies -![L2-EA Latencies](images/L2_ea_latencies_per_channel.png) -###### L2-EA Stalls -![L2-EA Stalls](images/L2_ea_stalls_per_channel.png) -###### L2-EA Write Stalls -![L2-EA Write Stalls](images/L2_ea_write_stalls_per_channel.png) -###### L2-EA Write Starvation -![L2-EA Write Starvation](images/L2_ea_write_starvation_per_channel.png) \ No newline at end of file +``` {figure} images/l2-accesses_panel.png +:alt: L2 Cache Accesses +:figclass: figure +:align: center + +Incoming requests to the L2 cache from the vector L1 data (vL1D) cache and other clients (e.g., the sL1D and L1I caches). +``` +###### L2 - Fabric Transactions +``` {figure} images/l2-fabric-transactions_panel.png +:alt: L2 - Fabric Transactions +:figclass: figure +:align: center + +More detail on the flow of requests through Infinity Fabric™. +``` +###### L2 - Fabric Interface Stalls +``` {figure} images/l2-fabric-interface-stalls_panel.png +:alt: L2 - Fabric Interface Stalls +:figclass: figure +:align: center + +A breakdown of what types of requests in a kernel caused a stall (e.g., read vs write), and to which locations (e.g., to the accelerator’s local memory, or to remote accelerators/CPUs). +``` + +##### L2 Cache Per Channel +###### Aggregate Stats +``` {figure} images/l2-per-channel-agg-stats_panel.png +:alt: Aggregate Stats +:figclass: figure +:align: center + +L2 Cache per channel performance at a glance. Metrics are aggregated over all available channels. 
+``` diff --git a/src/docs/conf.py b/src/docs/conf.py index af0003fb7..f1f26ff80 100644 --- a/src/docs/conf.py +++ b/src/docs/conf.py @@ -32,13 +32,13 @@ def install(package): # -- Project information ----------------------------------------------------- project = "Omniperf" -copyright = "2022, Audacious Software Group" +copyright = "2023-2024, Audacious Software Group" author = "Audacious Software Group" # The short X.Y version version = repo_version # The full version, including alpha/beta/rc tags -release = "" +release = repo_version # -- General configuration --------------------------------------------------- @@ -52,9 +52,12 @@ def install(package): "myst_parser", ] -myst_heading_anchors = 2 +show_authors = True + +myst_heading_anchors = 4 # enable replacement of (tm) & friends -myst_enable_extensions = ["replacements"] +myst_enable_extensions = ["replacements", "dollarmath"] + # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -112,6 +115,10 @@ def install(package): # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] +latex_elements = { + "sphinxsetup": 'verbatimwrapslines=true, verbatimforcewraps=true', +} + # -- Options for HTMLHelp output --------------------------------------------- @@ -130,7 +137,7 @@ def install(package): # Toc options "collapse_navigation": True, "sticky_navigation": True, - "navigation_depth": 4, + "navigation_depth": 5, "includehidden": True, "titles_only": False, } @@ -162,6 +169,7 @@ def setup(app): app.add_transform(AutoStructify) app.add_config_value("docstring_replacements", {}, True) app.connect("source-read", replaceString) + app.add_css_file("css/custom.css") # function to replace version string througout documentation diff --git a/src/docs/faq.md b/src/docs/faq.md index 6a996cc27..c5450532e 100644 --- a/src/docs/faq.md +++ b/src/docs/faq.md @@ -6,7 +6,7 @@ :maxdepth: 4 ``` -**1. How do I export profiling data I've already generated using Omniperf?** +**1. How do I export profiling data I have already generated using Omniperf?** In order to interact with the Grafana GUI you must sync data with the MongoDB backend. This interaction is done through ***database*** mode. @@ -35,11 +35,23 @@ $ export LANG=C.UTF-8 1. Open MobaXterm 2. In the top ribbon, select `Tunneling` -![Tunnel Button](images/tunnel_demo1.png) +``` {image} images/tunnel_demo1.png +:alt: MobaXterm Tunnel Button +:class: bg-primary +:align: center +``` This pop up will appear -![Pop up](images/tunnel_demo2.png) +``` {image} images/tunnel_demo2.png +:alt: MobaXterm Pop Up +:class: bg-primary +:align: center +``` 3. Press `New SSH tunnel` -![Pop up](images/tunnel_demo3.png) +``` {image} images/tunnel_demo3.png +:alt: MobaXterm Pop Up +:class: bg-primary +:align: center +``` 4. Configure tunnel accordingly Local clients @@ -52,4 +64,4 @@ This pop up will appear SSH Server - SSH server: Name of the server one is connecting to - SSH login: Username to login to the server - - SSH port: 22 \ No newline at end of file + - SSH port: 22 diff --git a/src/docs/getting_started.md b/src/docs/getting_started.md index 80ae888f0..b841fb063 100644 --- a/src/docs/getting_started.md +++ b/src/docs/getting_started.md @@ -10,13 +10,13 @@ 1. **Launch & Profile the target application with the command line profiler** - The command line profiler launches the target application, calls the rocProfiler API, and collects profile results for the specified kernels, dispatches, and/or IP blocks. 
If not specified, Omniperf will default to collecting all available counters for all kernels/dispatches launched by the user's executable. + The command line profiler launches the target application, calls the rocProfiler API via the rocProf binary, and collects profile results for the specified kernels, dispatches, and/or hardware components. If not specified, Omniperf will default to collecting all available counters for all kernels/dispatches launched by the user's executable. To collect the default set of data for all kernels in the target application, launch, e.g.: ```shell $ omniperf profile -n vcopy_data -- ./vcopy 1048576 256 ``` - The app runs, each kernel is launched, and profiling results are generated. By default, results are written to (e.g.,) ./workloads/vcopy_data (configurable via the `-n` argument). To collect all requested profile information, it may be required to replay kernels multiple times. + The app runs, each kernel is launched, and profiling results are generated. By default, results are written to, e.g., ./workloads/vcopy_data (configurable via the `-n` argument). To collect all requested profile information, it may be required to replay kernels multiple times. 2. **Customize data collection** @@ -25,19 +25,20 @@ Some common filters include: - - `-k`/`--kernel` enables filtering kernels by name. `-d`/`--dispatch` enables filtering based on dispatch ID - - `-b`/`--ipblocks` enables collects metrics for only the specified (one or more) IP Blocks. + - `-k`/`--kernel` enables filtering kernels by name. + - `-d`/`--dispatch` enables filtering based on dispatch ID. + - `-b`/`--ipblocks` collects metrics only for the specified (one or more) hardware component blocks. - To view available metrics by IP Block you can use the `--list-metrics` argument to view a list of all available metrics organized by IP Block. + To view available metrics by IP Block, you can use the `--list-metrics` argument: ```shell $ omniperf analyze --list-metrics ``` 3. **Analyze at the command line** - After generating a local output folder (./workloads/\), the command line tool can also be used to quickly interface with profiling results. View different metrics derived from your profiled results and get immediate access all metrics organized by IP block. + After generating a local output folder (./workloads/\), the command line tool can also be used to quickly interface with profiling results. View different metrics derived from your profiled results and get immediate access to all metrics organized by hardware block. - If no kernel, dispatch, or ipblock filters are applied at this stage, analysis will be reflective of the entirety of the profiling data. + If no kernel, dispatch, or hardware block filters are applied at this stage, analysis will be reflective of the entirety of the profiling data. To interact with profiling results from a different session, users just provide the workload path. `-p`/`--path` enables users to analyze existing profiling data in the Omniperf CLI. @@ -55,7 +56,7 @@ ### Modes Modes change the fundamental behavior of the Omniperf command line tool. Depending on which mode is chosen, different command line options become available. -- **Profile**: Target application is launched on the local system utilizing AMD’s [ROC Profiler](https://github.com/ROCm-Developer-Tools/rocprofiler). Depending on the profiling options chosen, selected kernels, dispatches, and/or IP Blocks in the application are profiled and results are stored locally in an output folder (./workloads/\).
+- **Profile**: Target application is launched on the local system using AMD’s [ROC Profiler](https://github.com/ROCm-Developer-Tools/rocprofiler). Depending on the profiling options chosen, selected kernels, dispatches, and/or hardware components in the application are profiled and results are stored locally in an output folder (./workloads/\). ```shell $ omniperf profile --help @@ -65,7 +66,7 @@ Modes change the fundamental behavior of the Omniperf command line tool. Dependi To gererate a lightweight GUI interface users can add the `--gui` flag to their analysis command. - This mode is designed to be a middle ground to the highly detailed Omniperf Grafana GUI and is great for users who want immediate access to an IP Block they’re already familiar with. + This mode is designed to be a middle ground to the highly detailed Omniperf Grafana GUI and is great for users who want immediate access to a hardware component they’re already familiar with. ```shell $ omniperf analyze --help @@ -90,4 +91,4 @@ Standalone roofline analysis | profile | `--name`, `--roof-only`, `-- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Total Fabric Requests + + + + + 32B Read Requests + + + + 64B Read Requests + + + + 32B Write Requests + + + + + + 64B Write Requests + + + + + + Uncached Read Requests + + + x2 + + + + Uncached Write Requests + + + + + + Atomic +Requests + + + + + + HBM Read +Requests + + + + + Remote Read +Requests + + + + + + + + + + + + + + + + + + + HBM Write Requests + + + + Remote Write Requests + + + + diff --git a/src/docs/images/fig_level_counter.png b/src/docs/images/fig_level_counter.png new file mode 100755 index 000000000..fa50539a0 Binary files /dev/null and b/src/docs/images/fig_level_counter.png differ diff --git a/src/docs/images/gcn_compute_unit.png b/src/docs/images/gcn_compute_unit.png new file mode 100644 index 000000000..e6c1f2eb0 Binary files /dev/null and b/src/docs/images/gcn_compute_unit.png differ diff --git a/src/docs/images/instr-cache-accesses_panel.png b/src/docs/images/instr-cache-accesses_panel.png new file mode 100644 index 000000000..926a7805e Binary files /dev/null and b/src/docs/images/instr-cache-accesses_panel.png differ diff --git a/src/docs/images/instr-cache-sol_panel.png b/src/docs/images/instr-cache-sol_panel.png new file mode 100644 index 000000000..64be7178c Binary files /dev/null and b/src/docs/images/instr-cache-sol_panel.png differ diff --git a/src/docs/images/l1perf_model.png b/src/docs/images/l1perf_model.png new file mode 100644 index 000000000..fdabfbb95 Binary files /dev/null and b/src/docs/images/l1perf_model.png differ diff --git a/src/docs/images/l1perf_model.svg b/src/docs/images/l1perf_model.svg new file mode 100644 index 000000000..dd22a7131 --- /dev/null +++ b/src/docs/images/l1perf_model.svg @@ -0,0 +1,584 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + Compute Unit + Cmd/Data + + + + Address Processing Unit + + + Sync + Data Processing Unit + + Virtual To Physical Address Translation + + Tag RAM + + L1 Cache Controller + + CacheRAM + + L2 Memory Interface + Data + + Bus + + L2 Cache + + + + diff --git a/src/docs/images/l2-accesses_panel.png b/src/docs/images/l2-accesses_panel.png new file mode 100644 index 000000000..101cf7753 Binary files /dev/null and b/src/docs/images/l2-accesses_panel.png differ diff --git 
a/src/docs/images/l2-fabric-interface-stalls_panel.png b/src/docs/images/l2-fabric-interface-stalls_panel.png new file mode 100644 index 000000000..b1bd415ca Binary files /dev/null and b/src/docs/images/l2-fabric-interface-stalls_panel.png differ diff --git a/src/docs/images/l2-fabric-transactions_panel.png b/src/docs/images/l2-fabric-transactions_panel.png new file mode 100644 index 000000000..7df5a7809 Binary files /dev/null and b/src/docs/images/l2-fabric-transactions_panel.png differ diff --git a/src/docs/images/l2-per-channel-agg-stats_panel.png b/src/docs/images/l2-per-channel-agg-stats_panel.png new file mode 100644 index 000000000..704d45c69 Binary files /dev/null and b/src/docs/images/l2-per-channel-agg-stats_panel.png differ diff --git a/src/docs/images/l2-sol_panel.png b/src/docs/images/l2-sol_panel.png new file mode 100644 index 000000000..646e608cb Binary files /dev/null and b/src/docs/images/l2-sol_panel.png differ diff --git a/src/docs/images/lds-sol_panel.png b/src/docs/images/lds-sol_panel.png new file mode 100644 index 000000000..c261513aa Binary files /dev/null and b/src/docs/images/lds-sol_panel.png differ diff --git a/src/docs/images/lds-stats_panel.png b/src/docs/images/lds-stats_panel.png new file mode 100644 index 000000000..0d9d419eb Binary files /dev/null and b/src/docs/images/lds-stats_panel.png differ diff --git a/src/docs/images/lds.png b/src/docs/images/lds.png new file mode 100644 index 000000000..f444eaf53 Binary files /dev/null and b/src/docs/images/lds.png differ diff --git a/src/docs/images/lds.svg b/src/docs/images/lds.svg new file mode 100644 index 000000000..c0adb5e91 --- /dev/null +++ b/src/docs/images/lds.svg @@ -0,0 +1,393 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SIMD 0/1 + SIMD 2/3 + + + + + + Conflict Detection + + + + Scheduler + + + + Bank 0 + + + + Bank 1 + + + + Bank 2 + + + + Bank 3 + + + + Bank 31 + + ... 
+ + diff --git a/src/docs/images/ldsbandwidth.png b/src/docs/images/ldsbandwidth.png new file mode 100644 index 000000000..bd74d6249 Binary files /dev/null and b/src/docs/images/ldsbandwidth.png differ diff --git a/src/docs/images/ldsbandwidth.svg b/src/docs/images/ldsbandwidth.svg new file mode 100644 index 000000000..a854f697d --- /dev/null +++ b/src/docs/images/ldsbandwidth.svg @@ -0,0 +1,1579 @@ + 2023-08-21T11:00:20.650499 + image/svg+xml + Matplotlib v3.7.1, https://matplotlib.org/
diff --git a/src/docs/images/ldsconflictrate.png b/src/docs/images/ldsconflictrate.png new file mode 100644 index 000000000..ab057f3cd Binary files /dev/null and b/src/docs/images/ldsconflictrate.png differ diff --git a/src/docs/images/ldsconflictrate.svg b/src/docs/images/ldsconflictrate.svg new file mode 100644 index 000000000..f98e9bc4a --- /dev/null +++ b/src/docs/images/ldsconflictrate.svg @@ -0,0 +1,1050 @@ + 2023-08-21T11:43:04.336525 + image/svg+xml + Matplotlib v3.7.1, https://matplotlib.org/
diff --git a/src/docs/images/ldsconflicts.png b/src/docs/images/ldsconflicts.png new file mode 100644 index 000000000..77c093858 Binary files /dev/null and b/src/docs/images/ldsconflicts.png differ diff --git a/src/docs/images/ldsconflicts.svg b/src/docs/images/ldsconflicts.svg new file mode 100644 index 000000000..f4a2f17d1 --- /dev/null +++ b/src/docs/images/ldsconflicts.svg @@ -0,0 +1,1145 @@ + 2023-08-17T18:14:36.907658 + image/svg+xml + Matplotlib v3.7.1, https://matplotlib.org/
diff --git a/src/docs/images/memory-chart_panel.png b/src/docs/images/memory-chart_panel.png new file mode 100644 index 000000000..1091a5032 Binary files /dev/null and b/src/docs/images/memory-chart_panel.png differ diff --git a/src/docs/images/nosplit.png b/src/docs/images/nosplit.png new file mode 100644 index 000000000..a8e5f0164 Binary files /dev/null and b/src/docs/images/nosplit.png differ diff --git a/src/docs/images/nosplit.svg b/src/docs/images/nosplit.svg new file mode 100644 index 000000000..d0d9606be --- /dev/null +++ b/src/docs/images/nosplit.svg @@ -0,0 +1,71 @@
diff --git a/src/docs/images/roofline_panel.png b/src/docs/images/roofline_panel.png new file mode 100644 index 000000000..47ee9bddb Binary files /dev/null and b/src/docs/images/roofline_panel.png differ diff --git a/src/docs/images/selayout.png b/src/docs/images/selayout.png new file mode 100644 index 000000000..73aa2b49d Binary files /dev/null and b/src/docs/images/selayout.png differ diff --git a/src/docs/images/sl1d-cache-accesses_panel.png b/src/docs/images/sl1d-cache-accesses_panel.png new file mode 100644 index 000000000..3605cce8a Binary files /dev/null and b/src/docs/images/sl1d-cache-accesses_panel.png differ diff --git a/src/docs/images/sl1d-l12-interface_panel.png b/src/docs/images/sl1d-l12-interface_panel.png new file mode 100644 index 000000000..5c3480ac9 Binary files /dev/null and b/src/docs/images/sl1d-l12-interface_panel.png differ diff --git a/src/docs/images/sl1d-sol_panel.png b/src/docs/images/sl1d-sol_panel.png new file mode 100644 index 000000000..92fa5a1a4 Binary files /dev/null and b/src/docs/images/sl1d-sol_panel.png differ diff --git a/src/docs/images/sol_panel.png b/src/docs/images/sol_panel.png new file mode 100644 index 000000000..f456500e0 Binary files /dev/null and b/src/docs/images/sol_panel.png differ diff --git a/src/docs/images/spi-resource-allocation_panel.png
b/src/docs/images/spi-resource-allocation_panel.png new file mode 100644 index 000000000..bee869ad1 Binary files /dev/null and b/src/docs/images/spi-resource-allocation_panel.png differ diff --git a/src/docs/images/spi-stats_panel.png b/src/docs/images/spi-stats_panel.png new file mode 100644 index 000000000..19c7ad364 Binary files /dev/null and b/src/docs/images/spi-stats_panel.png differ diff --git a/src/docs/images/split.png b/src/docs/images/split.png new file mode 100644 index 000000000..cca71eb2a Binary files /dev/null and b/src/docs/images/split.png differ diff --git a/src/docs/images/split.svg b/src/docs/images/split.svg new file mode 100644 index 000000000..b033a9e11 --- /dev/null +++ b/src/docs/images/split.svg @@ -0,0 +1,64 @@ + + + + + + + + + + + diff --git a/src/docs/images/system-info_panel.png b/src/docs/images/system-info_panel.png new file mode 100644 index 000000000..5a5fa0118 Binary files /dev/null and b/src/docs/images/system-info_panel.png differ diff --git a/src/docs/images/ta_panel.png b/src/docs/images/ta_panel.png new file mode 100644 index 000000000..2f08f9a6b Binary files /dev/null and b/src/docs/images/ta_panel.png differ diff --git a/src/docs/images/td_panel.png b/src/docs/images/td_panel.png new file mode 100644 index 000000000..819407515 Binary files /dev/null and b/src/docs/images/td_panel.png differ diff --git a/src/docs/images/top-stat_panel.png b/src/docs/images/top-stat_panel.png new file mode 100644 index 000000000..5e3dddca2 Binary files /dev/null and b/src/docs/images/top-stat_panel.png differ diff --git a/src/docs/images/uncached.png b/src/docs/images/uncached.png new file mode 100644 index 000000000..f770a1b29 Binary files /dev/null and b/src/docs/images/uncached.png differ diff --git a/src/docs/images/uncached.svg b/src/docs/images/uncached.svg new file mode 100644 index 000000000..53affd4fc --- /dev/null +++ b/src/docs/images/uncached.svg @@ -0,0 +1,125 @@ + + + + + + + + + + + + + + + + + + + + + + x2 + + diff --git a/src/docs/images/vl1d-addr-translation_panel.png b/src/docs/images/vl1d-addr-translation_panel.png new file mode 100644 index 000000000..0fb4aaf07 Binary files /dev/null and b/src/docs/images/vl1d-addr-translation_panel.png differ diff --git a/src/docs/images/vl1d-cache-accesses_panel.png b/src/docs/images/vl1d-cache-accesses_panel.png new file mode 100644 index 000000000..5259b2214 Binary files /dev/null and b/src/docs/images/vl1d-cache-accesses_panel.png differ diff --git a/src/docs/images/vl1d-cache-stalls_panel.png b/src/docs/images/vl1d-cache-stalls_panel.png new file mode 100644 index 000000000..61e09c915 Binary files /dev/null and b/src/docs/images/vl1d-cache-stalls_panel.png differ diff --git a/src/docs/images/vl1d-l2-transactions_panel.png b/src/docs/images/vl1d-l2-transactions_panel.png new file mode 100644 index 000000000..51875e516 Binary files /dev/null and b/src/docs/images/vl1d-l2-transactions_panel.png differ diff --git a/src/docs/images/vl1d-sol_panel.png b/src/docs/images/vl1d-sol_panel.png new file mode 100644 index 000000000..5c2485d0d Binary files /dev/null and b/src/docs/images/vl1d-sol_panel.png differ diff --git a/src/docs/images/wavefront-launch-stats_panel.png b/src/docs/images/wavefront-launch-stats_panel.png new file mode 100644 index 000000000..38e4517f3 Binary files /dev/null and b/src/docs/images/wavefront-launch-stats_panel.png differ diff --git a/src/docs/images/wavefront-runtime-stats_panel.png b/src/docs/images/wavefront-runtime-stats_panel.png new file mode 100644 index 000000000..517d461d3 Binary 
files /dev/null and b/src/docs/images/wavefront-runtime-stats_panel.png differ diff --git a/src/docs/index.md b/src/docs/index.md index 931718107..4f3f7c107 100644 --- a/src/docs/index.md +++ b/src/docs/index.md @@ -12,5 +12,6 @@ getting_started profiling analysis + performance_model faq ``` diff --git a/src/docs/installation.md b/src/docs/installation.md index af8c21bac..caf3e5cd2 100644 --- a/src/docs/installation.md +++ b/src/docs/installation.md @@ -17,7 +17,7 @@ Omniperf is broken into two installation components: - Mongo DB backend + Grafana instance - Packaged in a Docker container for easy setup -Determine what you need to install based on how you'd like to interact with Omniperf. See the decision tree below to help determine what installation is right for you. +Determine what you need to install based on how you would like to interact with Omniperf. See the decision tree below to help determine what installation is right for you. ![Omniperf Installtion Decision Tree](images/install_decision_tree.png) @@ -55,14 +55,14 @@ available from the of the Omniperf development site. From there, untar and descend into the top-level directory as follows: -```shell +```shell-session $ tar xfz omniperf-v{__VERSION__}.tar.gz $ cd omniperf-v{__VERSION__} ``` Next, install Python dependencies and complete the Omniperf configuration/install process as follows: -```shell +```shell-session # define top-level install path $ export INSTALL_DIR= @@ -87,7 +87,7 @@ do not have write access to the chosen install path. After completing these steps, a successful top-level installation directory looks as follows: -```shell +```shell-session $ ls $INSTALL_DIR modulefiles {__VERSION__} python-libs ``` @@ -102,7 +102,7 @@ follows: -```shell +```shell-session $ module use $INSTALL_DIR/modulefiles $ module load omniperf $ which omniperf @@ -125,14 +125,14 @@ To use Omniperf without the companion modulefile, update your `PATH` settings to enable access to the command-line binary. If you installed Python dependencies in a shared location, update your `PYTHONPATH` config as well: -```shell +```shell-session export PATH=$INSTALL_DIR/{__VERSION__}/bin:$PATH export PYTHONPATH=$INSTALL_DIR/python-libs ``` ### rocProf -Omniperf relies on a rocprof binary during the profiling +Omniperf relies on a rocProf binary during the profiling process. Normally the path to this binary will be detected automatically, but it can also be overridden via the setting the optional `ROCPROF` environment variable to the path of the binary the user @@ -162,9 +162,9 @@ Omniperf server-side requires the following basic software dependencies prior to The recommended process for enabling the server-side of Omniperf is to use the provided Docker file to build the Grafana and MongoDB instance. -Once you've decided which machine you'd like to use to host the Grafana and MongoDB instance, please follow the set up instructions below. +Once you have decided which machine you would like to use to host the Grafana and MongoDB instance, please follow the set up instructions below. -### 1) Install MongoDB Utils +### Install MongoDB Utils Omniperf uses [mongoimport](https://www.mongodb.com/docs/database-tools/mongoimport/) to upload data to Grafana's backend database. 
Install for Ubuntu 20.04 is as follows: ```bash @@ -173,7 +173,7 @@ $ sudo apt install ./mongodb-database-tools-ubuntu2004-x86_64-100.6.1.deb ``` > Installation instructions for alternative distributions can be found [here](https://www.mongodb.com/download-center/database-tools/releases/archive) -### 2) Persistent Storage +### Persistent Storage The user will also bind MongoDB to a directory on the host OS to create a local backup in case of a crash or reset. In the Docker world, this is known as "creating a persistent volume": ```bash $ sudo docker volume create --driver local --opt type=none --opt device=/usr/loc @@ -184,24 +184,24 @@ $ sudo docker volume create --driver local --opt type=none --opt device=/usr/local/persist/mongodb --opt o=bind grafana-mongo-db ``` -### 3) Build and Launch +### Build and Launch -We're now ready to build our Docker file. Navigate to your Omniperf install directory to begin. +We are now ready to build our Docker file. Navigate to your Omniperf install directory to begin. ```bash $ sudo docker-compose build $ sudo docker-compose up -d ``` > Note that TCP ports for Grafana (4000) and MongoDB (27017) in the docker container are mapped to 14000 and 27018, respectively, on the host side. -### 4) Setup Grafana Instance -Once you've launced your docker container you should be able to reach Grafana at **http://\:14000**. The default login credentials for the first-time Grafana setup are: +### Setup Grafana Instance +Once you have launched your docker container you should be able to reach Grafana at **http://\:14000**. The default login credentials for the first-time Grafana setup are: - Username: **admin** - Password: **admin** ![Grafana Welcome Page](images/grafana_welcome.png) -MongoDB Datasource Configuration +### MongoDB Datasource Configuration The MongoDB Datasource must be configured prior to the first-time use. Navigate to Grafana's Configuration page (shown below) to add the **Omniperf Data** connection. @@ -219,7 +219,7 @@ After properly configuring these fields click **Save & Test** (as shown below) t ![Datasource Settings](images/datasource_settings.jpg) -Omniperf Dashboard Import +### Omniperf Dashboard Import From *Create* → *Import*, (as shown below) upload the dashboard file, `/dashboards/Omniperf_v{__VERSION__}_pub.json`, from the Omniperf tarball. @@ -227,17 +227,17 @@ Edit both the Dashboard Name and the Unique Identifier (UID) to uniquely identif ![Import Dashboard](images/import_dashboard.png) -Using your dashboard +### Using your dashboard -Once you've imported a dashboard you're ready to begin! Start by browsing availible dashboards and selecting the dashboard you've just imported. +Once you have imported a dashboard you are ready to begin! Start by browsing available dashboards and selecting the dashboard you have just imported. ![Opening your dashboard](images/opening_dashboard.png) -Remeber, you'll need to upload workload data to the DB backend before analyzing in your Grafana interface. We provide a detailed example of this in our [Analysis section](./analysis.md#grafana-gui-import). +Remember, you will need to upload workload data to the DB backend before analyzing in your Grafana interface. We provide a detailed example of this in our [Analysis section](./analysis.md#grafana-gui-import). After a workload has been successfully uploaded, you should be able to select it from the workload dropdown located at the top of your Grafana dashboard.
![Selecting Grafana workload](images/grafana_workload_selection.png) -For more information on how to use the Grafana interface for anlysis please see the [Grafana section](./analysis.md#grafana-based-gui) in the Analyze Mode tab. +For more information on how to use the Grafana interface for analysis please see the [Grafana section](./analysis.md#grafana-based-gui) in the Analyze Mode tab. diff --git a/src/docs/introduction.md b/src/docs/introduction.md index f0e3864d1..6e595b926 100644 --- a/src/docs/introduction.md +++ b/src/docs/introduction.md @@ -6,15 +6,19 @@ :maxdepth: 4 ``` +This documentation was created to provide a detailed breakdown of all facets of Omniperf. In addition to a full deployment guide with installation instructions, we also explain the design of the tool and each of its components. If you are new to Omniperf, these chapters can be followed in order to gradually acquaint you with the tool and progressively introduce its more advanced features. + +This project is proudly open source, and we welcome all feedback! For more details on how to contribute, please see our Contribution Guide. + [Browse Omniperf source code on Github](https://github.com/AMDResearch/omniperf) -## Scope +## What is Omniperf -MI Performance Profiler ([Omniperf](https://github.com/AMDResearch/omniperf)) is a system performance profiling tool for Machine Learning/HPC workloads running on AMD Instinct (tm) Accelerators. It is currently built on top of the [rocProfiler](https://rocm.docs.amd.com/projects/rocprofiler/en/latest/rocprof.html) to monitor hardware performance counters. The Omniperf tool primarily targets accelerators in the MI100 and MI200 families. Development is in progress to support MI300 and Radeon (tm) RDNA (tm) GPUs. +Omniperf is a kernel level profiling tool for Machine Learning/HPC workloads running on AMD Instinct (tm) MI accelerators. AMD's Instinct (tm) MI accelerators are Data Center GPUs designed for compute and with some graphics functions disabled or removed. Omniperf is currently built on top of [rocProf](https://rocm.docs.amd.com/projects/rocprofiler/en/latest/rocprof.html) to monitor hardware performance counters. The Omniperf tool primarily targets accelerators in the MI100 and MI200 families. Development is in progress to support AMD Instinct (tm) MI300 and Radeon (tm) RDNA (tm) GPUs. ## Features -The Omniperf tool performs system profiling based on all available hardware counters for the target accelerator. It provides high level performance analysis features including System Speed-of-Light, IP block Speed-of-Light, Memory Chart Analysis, Roofline Analysis, Baseline Comparisons, and more... +The Omniperf tool performs profiling based on all available hardware counters for the target accelerator. It provides high level performance analysis features including System Speed-of-Light, Hardware block level Speed-of-Light, Memory Chart Analysis, Roofline Analysis, Baseline Comparisons, and more... Both command line analysis and GUI analysis are supported. 
@@ -32,25 +36,25 @@ Detailed Feature List: - System Speed-of-Light Panel - Kernel Statistic Panel - Memory Chart Analysis Panel -- Roofline Analysis Panel (*Supported on MI200 only, SLES 15 SP3 or RHEL8*) +- Roofline Analysis Panel (*Supported on MI200 only, Ubuntu 20.04, SLES 15 SP3 or RHEL8*) - Command Processor (CP) Panel -- Shader Processing Input (SPI) Panel +- Workgroup Manager (SPI) Panel - Wavefront Launch Panel - Compute Unit - Instruction Mix Panel - Compute Unit - Pipeline Panel - Local Data Share (LDS) Panel - Instruction Cache Panel - Scalar L1D Cache Panel -- Texture Addresser and Data Panel +- L1 Address Processing Unit, a.k.a. Texture Addresser (TA) / L1 Backend Data Processing Unit, a.k.a. Texture Data (TD) panel(s) - Vector L1D Cache Panel - L2 Cache Panel - L2 Cache (per-Channel) Panel -## Compatible SOCs +## Compatible SoCs | Platform | Status | | :------- | :------------- | -| Vega 20 (MI-50/60) | No | +| Vega 20 (MI50/60) | No | | MI100 | Supported | | MI200 | Supported | | MI300 | In development | diff --git a/src/docs/performance_model.md b/src/docs/performance_model.md new file mode 100644 index 000000000..59b685144 --- /dev/null +++ b/src/docs/performance_model.md @@ -0,0 +1,4408 @@ +# AMD Instinct(tm) MI Series Accelerator Performance Model + +```eval_rst +.. sectionauthor:: Nicholas Curtis +``` + +Omniperf makes available an extensive list of metrics to better understand achieved application performance on AMD Instinct(tm) MI accelerators including Graphics Core Next (GCN) GPUs such as the AMD Instinct MI50, CDNA(tm) accelerators such as the MI100, and CDNA(tm) 2 accelerators such as MI250X/250/210. + +To best utilize this profiling data, it is vital to understand the role of various hardware blocks of AMD Instinct accelerators. This section aims to describe each hardware block on the accelerator as interacted with by a software developer, and give a deeper understanding of the metrics reported therein. Refer to [Profiling with Omniperf by Example](profiling-with-omniperf) for more practical examples and detail on how to use Omniperf to optimize your code. + +(2xxnote)= +```{note} +In this document, we use `MI2XX` to refer to any of the AMD Instinct(tm) MI250X, MI250, and MI210 CDNA2 accelerators interchangeably for situations where the exact product in question is not relevant. +For more details on the differences between these accelerators, we refer the reader to the [MI250X](https://www.amd.com/en/products/server-accelerators/instinct-mi250x), [MI250](https://www.amd.com/en/products/server-accelerators/instinct-mi250) and [MI210](https://www.amd.com/en/products/server-accelerators/amd-instinct-mi210) product pages. +``` + + +(CU)= +## Compute Unit (CU) + +The Compute Unit (CU) is responsible for executing a user's kernels on AMD's CDNA(tm) accelerators. All [wavefronts](wavefront) of a [workgroup](workgroup) are scheduled on the same CU. + +![GCN Compute Unit](images/gcn_compute_unit.png) + +The CU consists of several independent pipelines / functional units: + +- The vector arithmetic logic unit (VALU) is composed of multiple Single Instruction Multiple Data (SIMD) vector processors, Vector General Purpose Registers (VGPRs) and instruction buffers. The VALU is responsible for executing much of the computational work on CDNA accelerators, including (but not limited to) floating-point operations (FLOPs), integer operations (IOPs), etc. 
+- The vector memory (VMEM) unit is responsible for issuing loads, stores and atomic operations that interact with the memory system. +- The Scalar Arithmetic Logic Unit (SALU) is shared by all threads in a [wavefront](wavefront), and is responsible for executing instructions that are known to be uniform across the wavefront at compile-time. The SALU has a memory unit (SMEM) for interacting with memory, but it cannot issue separately from the SALU. +- The Local Data Share (LDS) is an on-CU software-managed scratchpad memory that can be used to efficiently share data between all threads in a [workgroup](workgroup). +- The scheduler is responsible for issuing and decoding instructions for all the [wavefronts](wavefront) on the compute unit. +- The vector L1 data cache (vL1D) is the first level cache local to the compute unit. On current CDNA accelerators, the vL1D is write-through. The vL1D caches from multiple compute units are kept coherent with one another through software instructions. +- CDNA accelerators --- i.e., the MI100 and newer --- contain specialized matrix-multiplication accelerator pipelines known as the [Matrix Fused Multiply-Add (MFMA)](mfma). + +For a more thorough description of a compute unit on a CDNA accelerator, see [An introduction to AMD GPU +Programming with HIP](https://www.olcf.ornl.gov/wp-content/uploads/2019/09/AMD_GPU_HIP_training_20190906.pdf), specifically slides 22-28, and [Layla Mah's: The AMD GCN Architecture - A Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah), slide 27. + +The [Pipeline Descriptions section](ERD) details the various execution pipelines (VALU, SALU, LDS, Scheduler, etc.). +The metrics presented by Omniperf for these pipelines are described in the [Pipeline Metrics section](ERM). +Finally, the [vL1D](vL1D) cache and [LDS](LDS) will be described in their own sections. + + +(ERD)= +### Pipeline Descriptions + +(valu)= +#### Vector Arithmetic Logic Unit (VALU) + +The vector arithmetic logic unit (VALU) executes vector instructions over an entire wavefront, each [work-item](Workitem) (or vector-lane) potentially operating on distinct data. +The VALU of a CDNA accelerator or GCN GPU typically consists of: + +- four 16-wide SIMD processors (see [An introduction to AMD GPU +Programming with HIP](https://www.olcf.ornl.gov/wp-content/uploads/2019/09/AMD_GPU_HIP_training_20190906.pdf) for more details) +- four 64 or 128 KiB VGPR files (yielding a total of 256-512 KiB per CU), see [AGPRs](agprs) for more detail. +- An instruction buffer (per-SIMD) that contains execution slots for up to 8 wavefronts (for 32 total wavefront slots on each CU). +- A vector memory (VMEM) unit which transfers data between VGPRs and memory; each work-item supplies its own memory address and supplies or receives unique data. +- CDNA accelerators, such as the MI100 and [MI2XX](2xxnote), contain additional [Matrix Fused Multiply-Add (MFMA) units](https://gpuopen.com/learn/amd-lab-notes/amd-lab-notes-matrix-cores-readme/). + +In order to support branching / conditionals, each wavefront in the VALU has a distinct execution mask which determines which work-items in the wavefront are active for the currently executing instruction. +When executing a VALU instruction, inactive work-items (according to the current execution mask of the wavefront) do not execute the instruction and are treated as no-ops.
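As a concrete illustration of the execution mask, consider the following HIP sketch (the kernel and every name in it are hypothetical and not part of Omniperf or its samples). When the lanes of a wavefront diverge at the `if`, the instructions for both sides of the branch are issued to the VALU for the whole wavefront under complementary execution masks, and the lanes that are inactive for a given side are treated as no-ops:

```c++
#include <hip/hip_runtime.h>

// Hypothetical example: even and odd lanes of every 64-wide wavefront take
// opposite sides of the branch, so the wavefront diverges and the VALU issues
// both sides, each under an execution mask that disables the inactive lanes.
__global__ void divergent_update(const float* x, float* y, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n) return;
  if (idx % 2 == 0) {
    y[idx] = 2.0f * x[idx] + y[idx];  // issued with the even lanes active
  } else {
    y[idx] = -x[idx];                 // issued with the odd lanes active
  }
}

int main() {
  const int n = 1 << 20;
  float *x = nullptr, *y = nullptr;
  (void)hipMalloc(reinterpret_cast<void**>(&x), n * sizeof(float));
  (void)hipMalloc(reinterpret_cast<void**>(&y), n * sizeof(float));
  (void)hipMemset(x, 0, n * sizeof(float));
  (void)hipMemset(y, 0, n * sizeof(float));
  divergent_update<<<(n + 255) / 256, 256>>>(x, y, n);
  (void)hipDeviceSynchronize();
  (void)hipFree(x);
  (void)hipFree(y);
  return 0;
}
```

Because the instruction-mix and operation counters described later count instructions issued rather than lanes executed, a divergent wavefront such as this one contributes the instructions of both branch paths, even though each work-item produces only one of the two results.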
+ +```{note} +On GCN GPUs and the CDNA MI100 accelerator, there are slots for up to 10 wavefronts in the instruction buffer, but generally occupancy is limited by other factors to 32 waves per [Compute Unit](CU). +On the CDNA2 [MI2XX](2xxnote) series accelerators, there are only 8 waveslots per-SIMD. +``` + +(salu)= +#### Scalar Arithmetic Logic Unit (SALU) + +The scalar arithmetic logic unit (SALU) executes instructions that are shared between all work-items in a wavefront. This includes control-flow -- such as if/else conditionals, branches and looping -- pointer arithmetic, loading common values, etc. +The SALU consists of: + +- a scalar processor capable of various arithmetic, conditional, and comparison (etc.) operations. See, e.g., [Chapter 5. Scalar ALU Operations](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf) of the CDNA2 Instruction Set Architecture (ISA) Guide for more detail. +- a 12.5 KiB Scalar General Purpose Register (SGPR) file +- a scalar memory (SMEM) unit which transfers data between SGPRs and memory + +Data loaded by the SMEM can be cached in the [scalar L1 data cache](sL1D), and is typically only used for read-only, uniform accesses such as kernel arguments, or HIP's `__constant__` memory. + +(lds)= +#### Local Data Share (LDS) + +The local data share (LDS, a.k.a., "shared memory") is fast on-CU scratchpad that can be explicitly managed by software to effectively share data and to coordinate between wavefronts in a workgroup. + +```{figure} images/lds.* +:scale: 150 % +:alt: Performance model of the Local Data Share (LDS) on AMD Instinct(tm) MI accelerators. +:align: center + +Performance model of the Local Data Share (LDS) on AMD Instinct(tm) MI accelerators. +``` + +Above is Omniperf's performance model of the LDS on CDNA accelerators (adapted from [GCN Architecture, by Mike Mantor](https://old.hotchips.org/wp-content/uploads/hc_archives/hc24/HC24-3-ManyCore/HC24.28.315-AMD.GCN.mantor_v1.pdf), slide 20). +The SIMDs in the [VALU](valu) are connected to the LDS in pairs (see above). +Only one SIMD per pair may issue an LDS instruction at a time, but both pairs may issue concurrently. + +On CDNA accelerators, the LDS contains 32 banks and each bank is 4B wide. +The LDS is designed such that each bank can be read from/written to/atomically updated every cycle, for a total throughput of 128B/clock ([GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah), slide 40). + +On each of the two ports to the SIMDs, 64B can be sent in each direction per cycle. So, a single wavefront, coming from one of the 2 SIMDs in a pair, can only get back 64B/cycle (16 lanes per cycle). The input port is shared between data and address and this can affect achieved bandwidth for different data sizes. For example, a 64-wide store where each lane is sending a 4B value takes 8 cycles (50% peak bandwidth) while a 64-wide store where each lane is sending a 16B value takes 20 cycles (80% peak bandwidth). + +In addition, the LDS contains conflict-resolution hardware to detect and handle bank conflicts. +A bank conflict occurs when two (or more) work-items in a wavefront want to read, write, or atomically update different addresses that map to the same bank in the same cycle. +In this case, the conflict detection hardware will determine a new schedule such that the access is split into multiple cycles with no conflicts in any single cycle. 
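To make the bank-conflict discussion above concrete, the hedged HIP sketch below (the kernel name, sizes, and access patterns are illustrative assumptions, not Omniperf code) contrasts an LDS access pattern that funnels every lane of a wavefront into the same 4B-wide bank with a unit-stride pattern that spreads consecutive lanes across the 32 banks:

```c++
#include <hip/hip_runtime.h>

// Hypothetical illustration: with 32 LDS banks of 4B each, a 4B element lands
// in bank (element_index % 32).
__global__ void lds_bank_conflicts(float* out) {
  __shared__ float strided[64 * 32];  // 8 KiB of LDS
  __shared__ float linear[64];        // 256 B of LDS
  const int tid = threadIdx.x;

  // Conflict-prone: lane i writes element i*32, and (i*32) % 32 == 0 for every
  // lane, so the whole wavefront targets bank 0 at distinct addresses and the
  // conflict-resolution hardware serializes the access over multiple cycles.
  strided[tid * 32] = static_cast<float>(tid);

  // Conflict-free: consecutive lanes write consecutive elements, which map to
  // consecutive banks, so no bank is hit more than once in the same cycle.
  linear[tid] = static_cast<float>(tid);

  __syncthreads();
  out[tid] = strided[tid * 32] + linear[tid];
}

int main() {
  float* out = nullptr;
  (void)hipMalloc(reinterpret_cast<void**>(&out), 64 * sizeof(float));
  lds_bank_conflicts<<<1, 64>>>(out);
  (void)hipDeviceSynchronize();
  (void)hipFree(out);
  return 0;
}
```

When a 2D tile in LDS is accessed down a column, padding each row by one extra element is a common way to shift consecutive rows into different banks and avoid this kind of conflict.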
+ +When multiple work-items want to read from the same address within a bank, the result can be efficiently broadcasted ([GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah), slide 41). +Multiple work-items writing to the same address within a bank typically results undefined behavior in HIP and other languages, as the LDS will write the value from the last work-item as determined by the hardware scheduler ([GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah), slide 41). This behavior may be useful in the very specific case of storing a uniform value. + +Relatedly, an address conflict is defined as occurring when two (or more) work-items in a wavefront want to atomically update the same address on the same cycle. +As in a bank-conflict, this may cause additional cycles of work for the LDS operation to complete. + +(branch)= +#### Branch + +The branch unit is responsible for executing jumps and branches to execute control-flow operations. +Note that Branch operations are not used for execution mask updates, but only for “whole wavefront” control-flow changes. + +(scheduler)= +#### Scheduler + +The scheduler is responsible for arbitration and issue of instructions for all the wavefronts currently executing on the CU. On every clock cycle, the scheduler: + +- considers waves from one of the SIMD units for execution, selected in a round-robin fashion between the SIMDs in the [compute unit](CU) +- issues up to one instruction per wavefront on the selected SIMD +- issues up to one instruction per each of the instruction categories among the waves on the selected SIMD: + - [VALU](valu) + - [VMEM](valu) operations + - [SALU](salu) / SMEM operations + - [LDS](lds) + - [Branch](branch) operations + +This gives a maximum of five issued Instructions Per Cycle (IPC), per-SIMD, per-CU ([AMD GPU HIP Training](https://www.olcf.ornl.gov/wp-content/uploads/2019/09/AMD_GPU_HIP_training_20190906.pdf), [GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah)). + +On CDNA accelerators with [MFMA](mfma) instructions, these are issued via the [VALU](valu). Some of them will execute on a separate functional unit and typically allow other [VALU](valu) operations to execute in their shadow (see the [MFMA](mfma) section for more detail). + +```{note} +The IPC model used by Omniperf omits the following two complications for clarity. +First, CDNA accelerators contain other execution units on the CU that are unused for compute applications. +Second, so-called "internal" instructions (see [Layla Mah's GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah), slide 29) are not issued to a functional unit, and can technically cause the maximum IPC to _exceed_ 5 instructions per-cycle in special (largely unrealistic) cases. +The latter issue is discussed in more detail in our ['internal' IPC](Internal_ipc) example. +``` + +(mfma)= +#### Matrix Fused Multiply-Add (MFMA) + +CDNA accelerators, such as the MI100 and [MI2XX](2xxnote), contain specialized hardware to accelerate matrix-matrix multiplications, also known as Matrix Fused Multiply-Add (MFMA) operations. +The exact operation types and supported formats may vary by accelerator. 
+The reader is referred to the [AMD matrix cores](https://gpuopen.com/learn/amd-lab-notes/amd-lab-notes-matrix-cores-readme/) blog post on GPUOpen for a general discussion of these hardware units. +In addition, to explore the available MFMA instructions in-depth on various AMD accelerators (including the CDNA line), we recommend the [AMD Matrix Instruction Calculator](https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator). + +```{code-block} shell-session +:name: matrix_calc_ex +:caption: Partial snapshot of the AMD Matrix Instruction Calculator Tool + +$ ./matrix_calculator.py --architecture cdna2 --instruction v_mfma_f32_4x4x1f32 --detail-instruction +Architecture: CDNA2 +Instruction: V_MFMA_F32_4X4X1F32 + Encoding: VOP3P-MAI + VOP3P Opcode: 0x42 + VOP3P-MAI Opcode: 0x2 + Matrix Dimensions: + M: 4 + N: 4 + K: 1 + blocks: 16 + Execution statistics: + FLOPs: 512 + Execution cycles: 8 + FLOPs/CU/cycle: 256 + Can co-execute with VALU: True + VALU co-execution cycles possible: 4 + Register usage: + GPRs required for A: 1 + GPRs required for B: 1 + GPRs required for C: 4 + GPRs required for D: 4 + GPR alignment requirement: 8 bytes +``` + +For the purposes of Omniperf, the MFMA unit is typically treated as a separate pipeline from the [VALU](valu), as other VALU instructions (along with other execution pipelines such as the SALU) can be issued during a portion of the total duration of an MFMA operation. + +```{note} +The exact details of VALU and MFMA operation co-execution vary by instruction, and can be explored in more detail via the: + - 'Can co-execute with VALU' + - 'VALU co-execution cycles possible' + +fields in the [AMD Matrix Instruction Calculator](https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator#example-of-querying-instruction-information)'s detailed instruction information. +``` + +#### Non-pipeline resources + +In this section, we describe a few resources that are not standalone pipelines but are important for understanding performance optimization on CDNA accelerators. + +(barrier)= +##### Barrier + +Barriers are resources on the compute-unit of a CDNA accelerator that are used to implement synchronization primitives (e.g., HIP's `__syncthreads`). +Barriers are allocated to any workgroup that consists of more than a single wavefront. + +(agprs)= +##### Accumulation vector General-Purpose Registers (AGPRs) + +Accumulation vector General-Purpose Registers, or AGPRs, are special resources that are accessible to a subset of instructions focused on [MFMA](mfma) operations. +These registers allow the [MFMA](mfma) unit to access more than the normal maximum of 256 [architected Vector General-Purpose Registers (i.e., VGPRs)](valu) by having up to 256 in the architected space and up to 256 in the accumulation space. +Traditional VALU instructions can only use VGPRs in the architected space, and data can be moved to/from VGPRs↔AGPRs using specialized instructions (`v_accvgpr_*`). +These data movement instructions may be used by the compiler to implement lower-cost register-spill/fills on architectures with AGPRs. + +AGPRs are not available on all AMD Instinct(tm) accelerators. +GCN GPUs, such as the AMD Instinct(tm) MI50 had a 256 KiB VGPR file. +The AMD Instinct(tm) MI100 (CDNA) has a 2x256 KiB register file, where one half is available as general-purpose VGPRs, and the other half is for matrix math accumulation VGPRs (AGPRs). 
+The AMD Instinct(tm) [MI2XX](2xxnote) (CDNA2) has a 512 KiB VGPR file per CU, where each wave can dynamically request up to 256 KiB of VGPRs and an additional 256 KiB of AGPRs. +For more detail, the reader is referred to the [following comment](https://github.com/RadeonOpenCompute/ROCm/issues/1689#issuecomment-1553751913). + +(ERM)= +### Pipeline Metrics + +In this section, we describe the metrics available in Omniperf to analyze the pipelines discussed in the [previous section](ERD). + +#### Wavefront + +(Wavefront_launch_stats)= +##### Wavefront Launch Stats + +The wavefront launch stats panel gives general information about the kernel launch: + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Grid Size + - The total number of work-items (a.k.a "threads") launched as a part of the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied by the total workgroup (a.k.a "block") size. + - [Work-items](Workitem) +* - Workgroup Size + - The total number of work-items (a.k.a "threads") in each workgroup (a.k.a "block") launched as part of the kernel dispatch. In HIP, this is equivalent to the total block size. + - [Work-items](Workitem) +* - Total Wavefronts + - The total number of wavefronts launched as part of the kernel dispatch. On AMD Instinct(tm) CDNA accelerators and GCN GPUs, the wavefront size is always 64 work-items. Thus, the total number of wavefronts should be equivalent to the ceiling of Grid Size divided by 64. + - [Wavefronts](Wavefront) +* - Saved Wavefronts + - The total number of wavefronts saved at a context-save, see [cwsr_enable](https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr). + - [Wavefronts](Wavefront) +* - Restored Wavefronts + - The total number of wavefronts restored from a context-save, see [cwsr_enable](https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr). + - [Wavefronts](Wavefront) +* - VGPRs + - The number of architected vector general-purpose registers allocated for the kernel, see [VALU](valu). Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. + - [VGPRs](valu) +* - AGPRs + - The number of accumulation vector general-purpose registers allocated for the kernel, see [AGPRs](agprs). Note: this may not exactly match the number of AGPRs requested by the compiler due to allocation granularity. + - [AGPRs](agprs) +* - SGPRs + - The number of scalar general-purpose registers allocated for the kernel, see [SALU](salu). Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. + - [SGPRs](salu) +* - LDS Allocation + - The number of bytes of [LDS](lds) memory (a.k.a., "Shared" memory) allocated for this kernel. Note: This may also be larger than what was requested at compile-time due to both allocation granularity and dynamic per-dispatch LDS allocations. + - Bytes per [workgroup](workgroup) +* - Scratch Allocation + - The number of bytes of [scratch-memory](Mspace) requested _per_ work-item for this kernel. Scratch memory is used for stack memory on the accelerator, as well as for register spills/restores. 
+  - Bytes per [work-item](workitem) +``` + +(Wavefront_runtime_stats)= +##### Wavefront Runtime Stats + +The wavefront runtime statistics panel gives a high-level overview of the execution of wavefronts in a kernel: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - [Kernel Time](KernelTime) + - The total duration of the executed kernel. Note: this should not be directly compared to the wavefront cycles / timings below. + - Nanoseconds +* - [Kernel Cycles](KernelCycles) + - The total duration of the executed kernel in cycles. Note: this should not be directly compared to the wavefront cycles / timings below. + - Cycles +* - Instructions per wavefront + - The average number of instructions (of all types) executed per wavefront. This is averaged over all wavefronts in a kernel dispatch. + - Instructions / wavefront +* - Wave Cycles + - The number of cycles a wavefront in the kernel dispatch spent resident on a compute unit per [normalization-unit](normunit). This is averaged over all wavefronts in a kernel dispatch. Note: this should not be directly compared to the kernel cycles above. + - Cycles per [normalization-unit](normunit) +* - Dependency Wait Cycles + - The number of cycles a wavefront in the kernel dispatch stalled waiting on memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.) per [normalization-unit](normunit). This counter is incremented at every cycle by _all_ wavefronts on a CU stalled at a memory operation. As such, it is most useful to get a sense of how waves were spending their time, rather than identification of a precise limiter because another wave could be actively executing while a wave is stalled. The sum of this metric, Issue Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric. + - Cycles per [normalization-unit](normunit) +* - Issue Wait Cycles + - The number of cycles a wavefront in the kernel dispatch was unable to issue an instruction for any reason (e.g., execution pipe back-pressure, arbitration loss, etc.) per [normalization-unit](normunit). This counter is incremented at every cycle by _all_ wavefronts on a CU unable to issue an instruction. As such, it is most useful to get a sense of how waves were spending their time, rather than identification of a precise limiter because another wave could be actively executing while a wave is issue stalled. The sum of this metric, Dependency Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric. + - Cycles per [normalization-unit](normunit) +* - Active Cycles + - The average number of cycles a wavefront in the kernel dispatch was actively executing instructions per [normalization-unit](normunit). This measurement is made on a per-wavefront basis, and may include (e.g.,) cycles that another wavefront spent actively executing (e.g., on another execution unit) or was stalled. As such, it is most useful to get a sense of how waves were spending their time, rather than identification of a precise limiter. The sum of this metric, Issue Wait Cycles and Dependency Wait Cycles should be equal to the total Wave Cycles metric. + - Cycles per [normalization-unit](normunit) +* - Wavefront Occupancy + - The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (<< 1ms).
+ - Wavefronts +``` + +```{seealso} +As mentioned above, the measurement of kernel cycles and time typically cannot directly be compared to e.g., Wave Cycles. +This is due to two factors: first, the kernel cycles/timings are measured using a counter that is impacted by scheduling overhead, this is particularly noticeable for "short-running" kernels (typically << 1ms) where scheduling overhead forms a significant portion of the overall kernel runtime. +Secondly, the Wave Cycles metric is incremented per-wavefront scheduled to a SIMD every cycle whereas the kernel cycles counter is incremented only once per-cycle when _any_ wavefront is scheduled. +``` + +(Inst_mix)= +#### Instruction Mix + +The instruction mix panel shows a breakdown of the various types of instructions executed by the user's kernel, and which pipelines on the [CU](CU) they were executed on. +In addition, Omniperf reports further information about the breakdown of operation types for the [VALU](valu), vector-memory, and [MFMA](mfma) instructions. + +```{note} +All metrics in this section count _instructions issued_, and _not_ the total number of operations executed. +The values reported by these metrics will not change regardless of the execution mask of the wavefront. +We note that even if the execution mask is identically zero (i.e., _no lanes are active_) the instruction will still be counted, as CDNA accelerators still consider these instructions 'issued' see, e.g., [EXECute Mask, Section 3.3 of the CDNA2 ISA Guide](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf) for more details. +``` + +##### Overall Instruction Mix + +This panel shows the total number of each type of instruction issued to the [various compute pipelines](ERD) on the [CU](CU). +These are: + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - [VALU](valu) Instructions + - The total number of vector arithmetic logic unit (VALU) operations issued. These are the workhorses of the compute-unit, and are used to execute wide range of instruction types including floating point operations, non-uniform address calculations, transcendental operations, integer operations, shifts, conditional evaluation, etc. + - Instructions +* - VMEM Instructions + - The total number of vector memory operations issued. These include most loads, stores and atomic operations and all accesses to [generic, global, private and texture](Mspace) memory. + - Instructions +* - [LDS](lds) Instructions + - The total number of LDS (a.k.a., "shared memory") operations issued. These include (e.g.,) loads, stores, atomics, and HIP's `__shfl` operations. + - Instructions +* - [MFMA](mfma) Instructions + - The total number of matrix fused multiply-add instructions issued. + - Instructions +* - [SALU](salu) Instructions + - The total number of scalar arithmetic logic unit (SALU) operations issued. Typically these are used for (e.g.,) address calculations, literal constants, and other operations that are _provably_ uniform across a wavefront. Although scalar memory (SMEM) operations are issued by the SALU, they are counted separately in this section. + - Instructions +* - SMEM Instructions + - The total number of scalar memory (SMEM) operations issued. These are typically used for loading kernel arguments, base-pointers and loads from HIP's `__constant__` memory. + - Instructions +* - [Branch](branch) Instructions + - The total number of branch operations issued. 
These typically consist of jump / branch operations and are used to implement control flow. + - Instructions +``` + +```{note} +Note, as mentioned in the [Branch](branch) section: branch operations are not used for execution mask updates, but only for "whole wavefront" control-flow changes. +``` + +(VALU_Inst_Mix)= +##### VALU Arithmetic Instruction Mix +```{warning} +Not all metrics in this section (e.g., the floating-point instruction breakdowns) are available on CDNA accelerators older than the [MI2XX](2xxnote) series. +``` + +This panel details the various types of vector instructions that were issued to the [VALU](valu). +The metrics in this section do _not_ include [MFMA](mfma) instructions using the same precision, e.g. the "F16-ADD" metric does not include any 16-bit floating point additions executed as part of an MFMA instruction using the same precision. + +```{list-table} +:header-rows: 1 +:widths: 15 65 20 +:class: noscroll-table +* - Metric + - Description + - Unit +* - INT32 + - The total number of instructions operating on 32-bit integer operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - INT64 + - The total number of instructions operating on 64-bit integer operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F16-ADD + - The total number of addition instructions operating on 16-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F16-MUL + - The total number of multiplication instructions operating on 16-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F16-FMA + - The total number of fused multiply-add instructions operating on 16-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F16-TRANS + - The total number of transcendental instructions (e.g., `sqrt`) operating on 16-bit floating-point operands issued to the VALU per [normalization-unit](normunit) + - Instructions per [normalization-unit](normunit) +* - F32-ADD + - The total number of addition instructions operating on 32-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F32-MUL + - The total number of multiplication instructions operating on 32-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F32-FMA + - The total number of fused multiply-add instructions operating on 32-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F32-TRANS + - The total number of transcendental instructions (e.g., `sqrt`) operating on 32-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F64-ADD + - The total number of addition instructions operating on 64-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F64-MUL + - The total number of multiplication instructions operating on 64-bit floating-point operands issued to the VALU per [normalization-unit](normunit). 
+ - Instructions per [normalization-unit](normunit) +* - F64-FMA + - The total number of fused multiply-add instructions operating on 64-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - F64-TRANS + - The total number of transcendental instructions (e.g., `sqrt`) operating on 64-bit floating-point operands issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Conversion + - The total number of type conversion instructions (e.g., converting data to/from F32↔F64) issued to the VALU per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +``` + +For an example of these counters in action, the reader is referred to the [VALU Arithmetic Instruction Mix example](VALU_inst_mix_example). + +##### VMEM Instruction Mix + +This section breaks down the types of vector memory (VMEM) instructions that were issued. +Refer to the [Instruction Counts metrics section](TA_inst) of the address-processor frontend of the vL1D cache for a description of these VMEM instructions. + +(MFMA_Inst_mix)= +##### MFMA Instruction Mix + +```{warning} +The metrics in this section are only available on CDNA2 ([MI2XX](2xxnote)) accelerators and newer. +``` + +This section details the types of Matrix Fused Multiply-Add ([MFMA](mfma)) instructions that were issued. +Note that [MFMA](mfma) instructions are classified by the type of input data they operate on, and _not_ the data-type the result is accumulated to. + +```{list-table} +:header-rows: 1 +:widths: 25 60 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - MFMA-I8 Instructions + - The total number of 8-bit integer [MFMA](mfma) instructions issued per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - MFMA-F16 Instructions + - The total number of 16-bit floating point [MFMA](mfma) instructions issued per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - MFMA-BF16 Instructions + - The total number of 16-bit brain floating point [MFMA](mfma) instructions issued per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - MFMA-F32 Instructions + - The total number of 32-bit floating-point [MFMA](mfma) instructions issued per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - MFMA-F64 Instructions + - The total number of 64-bit floating-point [MFMA](mfma) instructions issued per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +``` + +#### Compute Pipeline + +(FLOP_count)= +##### FLOP counting conventions + +Omniperf's conventions for VALU FLOP counting are as follows: + - Addition or Multiplication: 1 operation + - Transcendentals: 1 operation + - Fused Multiply-Add (FMA): 2 operations + +Integer operations (IOPs) do not use this convention. They are counted as a single operation regardless of the instruction type. + +```{note} +Packed operations, which operate on multiple operands in the same instruction, are counted identically to the underlying instruction type. +For example, the `v_pk_add_f32` instruction on [MI2XX](2xxnote), which performs an add operation on two pairs of aligned 32-bit floating-point operands, is counted only as a single addition (i.e., 1 operation).
+``` + +As discussed in the [Instruction Mix](Inst_mix) section, the FLOP/IOP metrics in this section do not take into account the execution mask of the operation, and will report the same value even if the execution mask is identically zero. + +For example, an FMA instruction operating on 32-bit floating-point operands (e.g., `v_fma_f32` on a [MI2XX](2xxnote) accelerator) would be counted as 128 total FLOPs: 2 operations per work-item (due to the instruction type) multiplied by 64 (because the wavefront is composed of 64 work-items). + +(Compute_SOL)= +##### Compute Speed-of-Light + +```{warning} +The theoretical maximum throughput for some metrics in this section is currently computed with the maximum achievable clock frequency, as reported by `rocminfo`, for an accelerator. This may not be realistic for all workloads. +``` + +This section reports the number of floating-point and integer operations executed on the [VALU](valu) and [MFMA](mfma) units in various precisions. +We note that unlike the [VALU instruction mix](VALU_Inst_Mix) and [MFMA instruction mix](MFMA_Inst_mix) sections, the metrics here are reported as FLOPs and IOPs, i.e., the total number of operations executed. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - VALU FLOPs + - The total floating-point operations executed per second on the [VALU](valu). This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from [MFMA](mfma) instructions. + - GFLOPs +* - VALU IOPs + - The total integer operations executed per second on the [VALU](valu). This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from [MFMA](mfma) instructions. + - GIOPs +* - MFMA FLOPs (BF16) + - The total number of 16-bit brain floating point [MFMA](mfma) operations executed per second. Note: this does not include any 16-bit brain floating point operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the specific accelerator. + - GFLOPs +* - MFMA FLOPs (F16) + - The total number of 16-bit floating point [MFMA](mfma) operations executed per second. Note: this does not include any 16-bit floating point operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific accelerator. + - GFLOPs +* - MFMA FLOPs (F32) + - The total number of 32-bit floating point [MFMA](mfma) operations executed per second. Note: this does not include any 32-bit floating point operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific accelerator. + - GFLOPs +* - MFMA FLOPs (F64) + - The total number of 64-bit floating point [MFMA](mfma) operations executed per second. Note: this does not include any 64-bit floating point operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific accelerator. + - GFLOPs +* - MFMA IOPs (INT8) + - The total number of 8-bit integer [MFMA](mfma) operations executed per second. Note: this does not include any 8-bit integer operations from [VALU](valu) instructions.
This is also presented as a percent of the peak theoretical INT8 MFMA operations achievable on the specific accelerator. + - GIOPs +``` + + +(Pipeline_stats)= +##### Pipeline Statistics + +This section reports a number of key performance characteristics of various execution units on the [CU](cu). +The reader is referred to the [Instructions per-cycle and Utilizations](IPC_example) example for a detailed dive into these metrics, and to the [scheduler](scheduler) section for a high-level overview of execution units and instruction issue. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - IPC + - The ratio of the total number of instructions executed on the [CU](cu) over the [total active CU cycles](TotalActiveCUCycles). + - Instructions per-cycle +* - IPC (Issued) + - The ratio of the total number of (non-[internal](Internal_ipc)) instructions issued over the number of cycles where the [scheduler](scheduler) was actively working on issuing instructions. The reader is referred to the [Issued IPC](Issued_ipc) example for further detail. + - Instructions per-cycle +* - SALU Utilization + - Indicates what percent of the kernel's duration the [SALU](salu) was busy executing instructions. Computed as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [SALU](salu) / [SMEM](salu) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - VALU Utilization + - Indicates what percent of the kernel's duration the [VALU](valu) was busy executing instructions. Does not include [VMEM](valu) operations. Computed as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [VALU](valu) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - VMEM Utilization + - Indicates what percent of the kernel's duration the [VMEM](valu) unit was busy executing instructions, including both global/generic and spill/scratch operations (see the [VMEM instruction count metrics](TA_inst) for more detail). Does not include [VALU](valu) operations. Computed as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [VMEM](valu) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - Branch Utilization + - Indicates what percent of the kernel's duration the [Branch](branch) unit was busy executing instructions. Computed as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [Branch](branch) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - VALU Active Threads + - Indicates the average level of [divergence](Divergence) within a wavefront over the lifetime of the kernel. The number of work-items that were active in a wavefront during execution of each [VALU](valu) instruction, time-averaged over all VALU instructions run on all wavefronts in the kernel. + - Work-items +* - MFMA Utilization + - Indicates what percent of the kernel's duration the [MFMA](mfma) unit was busy executing instructions. Computed as the ratio of the total number of cycles the [MFMA](mfma) unit was busy over the [total CU cycles](TotalCUCycles). + - Percent +* - MFMA Instruction Cycles + - The average duration of [MFMA](mfma) instructions in this kernel in cycles. Computed as the ratio of the total number of cycles the [MFMA](mfma) unit was busy over the total number of [MFMA](mfma) instructions.
Compare to e.g., the [AMD Matrix Instruction Calculator](https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator). + - Cycles per instruction +* - VMEM Latency + - The average number of round-trip cycles (i.e., from issue to data-return / acknowledgment) required for a VMEM instruction to complete. + - Cycles +* - SMEM Latency + - The average number of round-trip cycles (i.e., from issue to data-return / acknowledgment) required for a SMEM instruction to complete. + - Cycles +``` + +```{note} +The Branch utilization reported in this section also includes time spent in other instruction types (namely: `s_endpgm`) that are _typically_ a very small percentage of the overall kernel execution. This complication is omitted for simplicity, but may result in small amounts of "branch" utilization (<<1\%) for otherwise branch-less kernels. +``` + +(FLOPS)= +##### Arithmetic Operations + +This section reports the total number of floating-point and integer operations executed in various precisions. +Unlike the [Compute speed-of-light](Compute_SOL) panel, this section reports both [VALU](valu) and [MFMA](mfma) operations of the same precision (e.g., F32) in the same metric. +Additionally, this panel lets the user control how the data is normalized (i.e., control the [normalization-unit](normunit)), while the speed-of-light panel does not. +For more detail on how operations are counted see the [FLOP counting convention](FLOP_count) section. + +```{warning} +As discussed in the [Instruction Mix](Inst_Mix) section, the metrics in this section do not take into account the execution mask of the operation, and will report the same value even if EXEC is identically zero. +``` + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - FLOPs (Total) + - The total number of floating-point operations executed on either the [VALU](valu) or [MFMA](mfma) units, per [normalization-unit](normunit) + - FLOP per [normalization-unit](normunit) +* - IOPs (Total) + - The total number of integer operations executed on either the [VALU](valu) or [MFMA](mfma) units, per [normalization-unit](normunit) + - IOP per [normalization-unit](normunit) +* - F16 OPs + - The total number of 16-bit floating-point operations executed on either the [VALU](valu) or [MFMA](mfma) units, per [normalization-unit](normunit) + - FLOP per [normalization-unit](normunit) +* - BF16 OPs + - The total number of 16-bit brain floating-point operations executed on either the [VALU](valu) or [MFMA](mfma) units, per [normalization-unit](normunit). Note: on current CDNA accelerators, the [VALU](valu) has no native BF16 instructions. + - FLOP per [normalization-unit](normunit) +* - F32 OPs + - The total number of 32-bit floating-point operations executed on either the [VALU](valu) or [MFMA](mfma) units, per [normalization-unit](normunit) + - FLOP per [normalization-unit](normunit) +* - F64 OPs + - The total number of 64-bit floating-point operations executed on either the [VALU](valu) or [MFMA](mfma) units, per [normalization-unit](normunit) + - FLOP per [normalization-unit](normunit) +* - INT8 OPs + - The total number of 8-bit integer operations executed on either the [VALU](valu) or [MFMA](mfma) units, per [normalization-unit](normunit). Note: on current CDNA accelerators, the [VALU](valu) has no native INT8 instructions. 
+ - IOPs per [normalization-unit](normunit) +``` + +(LDS_metrics)= +### Local Data Share (LDS) + +#### LDS Speed-of-Light + +```{warning} +The theoretical maximum throughput for some metrics in this section are currently computed with the maximum achievable clock frequency, as reported by `rocminfo`, for an accelerator. This may not be realistic for all workloads. +``` + +The LDS speed-of-light chart shows a number of key metrics for the [LDS](lds) as a comparison with the peak achievable values of those metrics. +The reader is referred to our previous [LDS](lds) description for a more in-depth view of the hardware. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Utilization + - Indicates what percent of the kernel's duration the [LDS](lds) was actively executing instructions (including, but not limited to, load, store, atomic and HIP's `__shfl` operations). Calculated as the ratio of the total number of cycles LDS was active over the [total CU cycles](TotalCUCycles). + - Percent +* - Access Rate + - Indicates the percentage of SIMDs in the [VALU](valu){sup}`1` actively issuing LDS instructions, averaged over the lifetime of the kernel. Calculated as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [LDS](lds) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - Theoretical Bandwidth (% of Peak) + - Indicates the maximum amount of bytes that _could_ have been loaded from/stored to/atomically updated in the LDS in this kernel, as a percent of the peak LDS bandwidth achievable. See the [LDS Bandwidth example](lds_bandwidth) for more detail. + - Percent +* - Bank Conflict Rate + - Indicates the percentage of active LDS cycles that were spent servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing bank conflicts over the number of LDS cycles that would have been required to move the same amount of data in an uncontended access.{sup}`2` + - Percent +``` + +```{note} +{sup}`1` Here we assume the typical case where the workload evenly distributes LDS operations over all SIMDs in a CU (that is, waves on different SIMDs are executing similar code). +For highly unbalanced workloads, where e.g., one SIMD pair in the CU does not issue LDS instructions at all, this metric is better interpreted as the percentage of SIMDs issuing LDS instructions on [SIMD pairs](lds) that are actively using the LDS, averaged over the lifetime of the kernel. + +{sup}`2` The maximum value of the bank conflict rate is less than 100% (specifically: 96.875%), as the first cycle in the [LDS scheduler](lds) is never considered contended. +``` + +#### Statistics + +The [LDS](lds) statistics panel gives a more detailed view of the hardware: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - LDS Instructions + - The total number of LDS instructions (including, but not limited to, read/write/atomics, and e.g., HIP's `__shfl` instructions) executed per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Theoretical Bandwidth + - Indicates the maximum amount of bytes that could have been loaded from/stored to/atomically updated in the LDS per [normalization-unit](normunit). Does _not_ take into account the execution mask of the wavefront when the instruction was executed (see [LDS Bandwidth](lds_bandwidth) example for more detail). 
+ - Bytes per [normalization-unit](normunit) +* - LDS Latency + - The average number of round-trip cycles (i.e., from issue to data-return / acknowledgment) required for an LDS instruction to complete. + - Cycles +* - Bank Conflicts/Access + - The ratio of the number of cycles spent in the [LDS scheduler](lds) due to bank conflicts (as determined by the conflict resolution hardware) to the base number of cycles that would be spent in the LDS scheduler in a completely uncontended case. This is the unnormalized form of the Bank Conflict Rate. + - Conflicts/Access +* - Index Accesses + - The total number of cycles spent in the [LDS scheduler](lds) over all operations per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +* - Atomic Return Cycles + - The total number of cycles spent on LDS atomics with return per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +* - Bank Conflicts + - The total number of cycles spent in the [LDS scheduler](lds) due to bank conflicts (as determined by the conflict resolution hardware) per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +* - Address Conflicts + - The total number of cycles spent in the [LDS scheduler](lds) due to address conflicts (as determined by the conflict resolution hardware) per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +* - Unaligned Stall + - The total number of cycles spent in the [LDS scheduler](lds) due to stalls from non-dword aligned addresses per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +* - Memory Violations + - The total number of out-of-bounds accesses made to the LDS, per [normalization-unit](normunit). This is unused and expected to be zero in most configurations for modern CDNA accelerators. + - Accesses per [normalization-unit](normunit) +``` + + +(vL1D)= +### Vector L1 Cache (vL1D) + +The vector L1 data (vL1D) cache is local to each [compute unit](CU) on the accelerator, and handles vector memory operations issued by a wavefront. +The vL1D cache consists of several components: + + - an address processing unit, also known as the [texture addresser (TA)](TA), which receives commands (e.g., instructions) and write/atomic data from the [Compute Unit](CU), and coalesces them into fewer requests for the cache to process. + - an address translation unit, also known as the L1 Unified Translation Cache (UTCL1), that translates requests from virtual to physical addresses for lookup in the cache. The translation unit has an L1 translation lookaside buffer (L1TLB) to reduce the cost of repeated translations. + - a Tag RAM that looks up whether a requested cache line is already present in the [cache](TC). + - the result of the Tag RAM lookup is placed in the L1 cache controller for routing to the correct location, e.g., the [L2 Memory Interface](TCP_TCC_Transactions_Detail) for misses or the [Cache RAM](TC) for hits. + - the Cache RAM, also known as the [texture cache (TC)](TC), stores requested data for potential reuse. Data returned from the [L2 cache](L2) is placed into the Cache RAM before going down the [data-return path](TD). + - a backend data processing unit, also known as the [texture data (TD)](TD) that routes data back to the requesting [Compute Unit](CU). + +Together, this complex is known as the vL1D, or Texture Cache per Pipe (TCP). 
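+To make this pipeline concrete, the following HIP sketch annotates a trivial copy kernel with the vL1D stages that each vector memory operation passes through. This is an illustrative sketch only; the kernel and variable names are ours and do not correspond to anything reported by Omniperf.
+
+```c++
+#include <hip/hip_runtime.h>
+
+// One vector load and one vector store, annotated with the vL1D stages
+// described above.
+__global__ void copy_one_element(const float* in, float* out) {
+  // The load below is issued by the wavefront as a vector memory instruction:
+  //  1. the address processor (TA) coalesces the per-lane addresses into
+  //     cache-line requests,
+  //  2. the UTCL1 translates the virtual addresses to physical addresses,
+  //  3. the Tag RAM lookup decides hit vs. miss (misses go out to the L2,
+  //     hits are served from the Cache RAM / TC), and
+  //  4. the data-return path (TD) routes the returned data back to this
+  //     wavefront's SIMD.
+  float v = in[threadIdx.x];
+  // The store takes the same front-end path (TA -> UTCL1 -> Tag RAM), with the
+  // write data carried alongside the addresses.
+  out[threadIdx.x] = v;
+}
+
+int main() {
+  float *in = nullptr, *out = nullptr;
+  (void)hipMalloc(&in, 64 * sizeof(float));
+  (void)hipMalloc(&out, 64 * sizeof(float));
+  hipLaunchKernelGGL(copy_one_element, dim3(1), dim3(64), 0, 0, in, out);
+  (void)hipDeviceSynchronize();
+  (void)hipFree(in);
+  (void)hipFree(out);
+  return 0;
+}
+```
+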
+A simplified diagram of the vL1D is presented below: + +```{figure} images/l1perf_model.* +:scale: 150 % +:alt: Performance model of the vL1D Cache on AMD Instinct(tm) MI accelerators. +:align: center + +Performance model of the vL1D Cache on AMD Instinct(tm) MI accelerators. +``` + +(L1_SOL)= +#### vL1D Speed-of-Light + +```{warning} +The theoretical maximum throughput for some metrics in this section are currently computed with the maximum achievable clock frequency, as reported by `rocminfo`, for an accelerator. This may not be realistic for all workloads. +``` + +The vL1D's speed-of-light chart shows several key metrics for the vL1D as a comparison with the peak achievable values of those metrics. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Hit Rate + - The ratio of the number of vL1D cache line requests that hit{sup}`1` in vL1D cache over the total number of cache line requests to the [vL1D Cache RAM](TC). + - Percent +* - Bandwidth + - The number of bytes looked up in the vL1D cache as a result of [VMEM](VALU) instructions, as a percent of the peak theoretical bandwidth achievable on the specific accelerator. The number of bytes is calculated as the number of cache lines requested multiplied by the cache line size. This value does not consider partial requests, so e.g., if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. + - Percent +* - Utilization + - Indicates how busy the [vL1D Cache RAM](TC) was during the kernel execution. The number of cycles where the [vL1D Cache RAM](TC) is actively processing any request divided by the number of cycles where the [vL1D is active](vL1d_activity){sup}`2` + - Percent +* - Coalescing + - Indicates how well memory instructions were coalesced by the [address processing unit](TA), ranging from uncoalesced (25\%) to fully coalesced (100\%). The average number of [thread-requests](ThreadRequests) generated per instruction divided by the ideal number of [thread-requests](ThreadRequests) per instruction. + - Percent +``` + +(vL1d_activity)= +```{note} +{sup}`1` The vL1D cache on AMD Instinct(tm) MI CDNA accelerators uses a "hit-on-miss" approach to reporting cache hits. +That is, if while satisfying a miss, another request comes in that would hit on the same pending cache line, the subsequent request will be counted as a 'hit'. +Therefore, it is also important to consider the Access Latency metric in the [Cache access metrics](TCP_cache_access_metrics) section when evaluating the vL1D hit rate. + +{sup}`2` Omniperf considers the vL1D to be active when any part of the vL1D (excluding the [address-processor](TA) and [data-return](TD) units) are active, e.g., performing a translation, waiting for data, accessing the Tag or Cache RAMs, etc. +``` +(TA)= +#### Address Processing Unit or Texture Addresser (TA) + +The [vL1D](vL1D)'s address processing unit receives vector memory instructions (commands) along with write/atomic data from a [Compute Unit](CU) and is responsible for coalescing these into requests for lookup in the [vL1D RAM](TC). +The address processor passes information about the commands (coalescing state, destination SIMD, etc.) to the [data processing unit](TD) for use after the requested data has been retrieved. 
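+As a concrete illustration of what the address processor does, the following HIP sketch contrasts a fully coalesced access pattern with a strided one. This is an illustrative example only (the kernel names, sizes and stride are ours); on a CDNA accelerator one would generally expect the strided version to generate more cache-line requests per instruction, which is reflected in the Coalescing metric of the [speed-of-light](L1_SOL) table and in the instruction and stall metrics below.
+
+```c++
+#include <hip/hip_runtime.h>
+
+// Adjacent work-items read adjacent elements: the per-lane addresses of a
+// wavefront fall into few cache lines, so the address processor can coalesce
+// them into a small number of requests.
+__global__ void coalesced_read(const float* in, float* out, size_t n) {
+  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n) out[i] = in[i];
+}
+
+// Adjacent work-items read elements far apart: the per-lane addresses of a
+// wavefront touch many distinct cache lines, so the address processor must
+// generate many more requests per instruction for the vL1D to service.
+__global__ void strided_read(const float* in, float* out, size_t n, size_t stride) {
+  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n) out[i] = in[(i * stride) % n];
+}
+
+int main() {
+  constexpr size_t n = 1 << 24;
+  float *in = nullptr, *out = nullptr;
+  (void)hipMalloc(&in, n * sizeof(float));
+  (void)hipMalloc(&out, n * sizeof(float));
+  hipLaunchKernelGGL(coalesced_read, dim3(n / 256), dim3(256), 0, 0, in, out, n);
+  hipLaunchKernelGGL(strided_read, dim3(n / 256), dim3(256), 0, 0, in, out, n, size_t(64));
+  (void)hipDeviceSynchronize();
+  (void)hipFree(in);
+  (void)hipFree(out);
+  return 0;
+}
+```
+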
+ +Omniperf reports several metrics to indicate performance bottlenecks in the address processing unit, which are broken down into a few categories: + + - Busy / stall metrics + - Instruction counts + - Spill / Stack metrics + +##### Busy / Stall metrics + +When executing vector memory instructions, the compute unit must send an address (and in the case of writes/atomics, data) to the address processing unit. When the frontend cannot accept any more addresses, it must backpressure the wave-issue logic for the VMEM pipe and prevent the issue of a vector memory instruction until a previously issued memory operation has been processed. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Busy + - Percent of the [total CU cycles](TotalCUCycles) the address processor was busy + - Percent +* - Address Stall + - Percent of the [total CU cycles](TotalCUCycles) the address processor was stalled from sending address requests further into the vL1D pipeline + - Percent +* - Data Stall + - Percent of the [total CU cycles](TotalCUCycles) the address processor was stalled from sending write/atomic data further into the vL1D pipeline + - Percent +* - Data-Processor → Address Stall + - Percent of [total CU cycles](TotalCUCycles) the address processor was stalled waiting to send command data to the [data processor](TD) + - Percent +``` + + +(TA_inst)= +##### Instruction counts + +The address processor also counts instruction types to give the user information on what sorts of memory instructions were executed by the kernel. +These are broken down into a few major categories: + +```{list-table} +:header-rows: 1 +:widths: 20 20 60 +:class: noscroll-table +* - Memory type + - Usage + - Description +* - Global + - Global memory + - Global memory can be seen by all threads from a process. This includes the local accelerator's DRAM, remote accelerator's DRAM, and the host's DRAM. +* - Generic + - Dynamic address spaces + - Generic memory, a.k.a. "flat" memory, is used when the compiler cannot statically prove that a pointer is to memory in one or the other address spaces. The pointer could dynamically point into global, local, constant, or private memory. +* - Private Memory + - Register spills / Stack memory + - Private memory, a.k.a. "scratch" memory, is only visible to a particular [work-item](workitem) in a particular [workgroup](workgroup). On AMD Instinct(tm) MI accelerators, private memory is used to implement both register spills and stack memory accesses. +``` + +The address processor counts these instruction types as follows: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table + +* - Type + - Description + - Unit +* - Global/Generic + - The total number of global & generic memory instructions executed on all [compute units](CU) on the accelerator, per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Global/Generic Read + - The total number of global & generic memory read instructions executed on all [compute units](CU) on the accelerator, per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Global/Generic Write + - The total number of global & generic memory write instructions executed on all [compute units](CU) on the accelerator, per [normalization-unit](normunit). 
+ - Instructions per [normalization-unit](normunit) +* - Global/Generic Atomic + - The total number of global & generic memory atomic (with and without return) instructions executed on all [compute units](CU) on the accelerator, per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Spill/Stack + - The total number of spill/stack memory instructions executed on all [compute units](CU) on the accelerator, per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Spill/Stack Read + - The total number of spill/stack memory read instructions executed on all [compute units](CU) on the accelerator, per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Spill/Stack Write + - The total number of spill/stack memory write instructions executed on all [compute units](CU) on the accelerator, per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Spill/Stack Atomic + - The total number of spill/stack memory atomic (with and without return) instructions executed on all [compute units](CU) on the accelerator, per [normalization-unit](normunit). Typically unused, as these memory operations are generally used to implement thread-local storage. + - Instructions per [normalization-unit](normunit) +``` + +```{note} +The above is a simplified model specifically for the HIP programming language that does not consider (e.g.,) inline assembly usage, constant memory usage or texture memory. + +These categories correspond to: + - Global/Generic: global and flat memory operations, which are used for Global and Generic memory access. + - Spill/Stack: buffer instructions which are used on the MI50, MI100, and [MI2XX](2xxnote) accelerators for register spills / stack memory. + +These concepts are described in more detail in the [memory space section](Mspace) below, while generic memory access is explored in the [generic memory benchmark](flatmembench) section. +``` + +##### Spill/Stack metrics + +Finally, the address processing unit contains a separate coalescing stage for spill/stack memory, and thus reports: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Spill/Stack Total Cycles + - The number of cycles the address processing unit spent working on spill/stack instructions, per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +* - Spill/Stack Coalesced Read Cycles + - The number of cycles the address processing unit spent working on coalesced spill/stack read instructions, per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +* - Spill/Stack Coalesced Write Cycles + - The number of cycles the address processing unit spent working on coalesced spill/stack write instructions, per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +``` + +(UTCL1)= +#### L1 Unified Translation Cache (UTCL1) + +After a vector memory instruction has been processed/coalesced by the address processing unit of the vL1D, it must be translated from a virtual to a physical address. +This process is handled by the L1 Unified Translation Cache (UTCL1). +This cache contains an L1 Translation Lookaside Buffer (TLB), which stores recently translated addresses to reduce the cost of subsequent re-translations.
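+To illustrate when the UTCL1 matters, the following HIP sketch (illustrative only; the kernel name and the 4 KiB page-size assumption are ours, and the actual page size depends on the system configuration) touches one element per page of a large allocation. Nearly every request then needs a distinct virtual-to-physical translation, so one would expect a comparatively low UTCL1 hit ratio and a high translation-miss count, whereas reading the same buffer contiguously would reuse the cached translations.
+
+```c++
+#include <hip/hip_runtime.h>
+
+// Reads one float from each page of a large allocation, so that successive
+// requests map to different pages and (usually) different UTCL1 entries.
+__global__ void page_stride_read(const float* in, float* out, size_t n_pages,
+                                 size_t floats_per_page) {
+  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n_pages) out[i] = in[i * floats_per_page];
+}
+
+int main() {
+  constexpr size_t page_bytes = 4096;            // assumed page size, for illustration only
+  constexpr size_t n_pages = size_t(1) << 16;    // 64 Ki pages -> 256 MiB buffer
+  constexpr size_t floats_per_page = page_bytes / sizeof(float);
+
+  float *in = nullptr, *out = nullptr;
+  (void)hipMalloc(&in, n_pages * page_bytes);
+  (void)hipMalloc(&out, n_pages * sizeof(float));
+  hipLaunchKernelGGL(page_stride_read, dim3(n_pages / 256), dim3(256), 0, 0,
+                     in, out, n_pages, floats_per_page);
+  (void)hipDeviceSynchronize();
+  (void)hipFree(in);
+  (void)hipFree(out);
+  return 0;
+}
+```
+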
+ +Omniperf reports the following L1 TLB metrics: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Requests + - The number of translation requests made to the UTCL1 per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Hits + - The number of translation requests that hit in the UTCL1, and could be reused, per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Hit Ratio + - The ratio of the number of translation requests that hit in the UTCL1 divided by the total number of translation requests made to the UTCL1. + - Percent +* - Translation Misses + - The total number of translation requests that missed in the UTCL1 due to translation not being present in the cache, per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Permission Misses + - The total number of translation requests that missed in the UTCL1 due to a permission error, per [normalization-unit](normunit). This is unused and expected to be zero in most configurations for modern CDNA accelerators. + - Requests per [normalization-unit](normunit) +``` +```{note} +On current CDNA accelerators, such as the [MI2XX](2xxnote), the UTCL1 does _not_ count hit-on-miss requests. +``` + +(TC)= +#### Vector L1 Cache RAM (TC) + +After coalescing in the [address processing unit](TA) of the vL1D and address translation in the [L1 TLB](UTCL1), the request proceeds to the Cache RAM stage of the pipeline. +Incoming requests are looked up in the cache RAMs using parts of the physical address as a tag. +Hits will be returned through the [data-return path](TD), while misses will be routed out to the [L2 Cache](L2) for servicing. + +The metrics tracked by the vL1D RAM include: + + - Stall metrics + - Cache access metrics + - vL1D-L2 transaction detail metrics + +(TCP_cache_stall_metrics)= +##### vL1D cache stall metrics + +The vL1D also reports where it is stalled in the pipeline, which may indicate performance limiters of the cache. +A stall in the pipeline may result in backpressuring earlier parts of the pipeline, e.g., a stall on L2 requests may backpressure the wave-issue logic of the [VMEM](VALU) pipe and prevent it from issuing more vector memory instructions until the vL1D's outstanding requests are completed. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Stalled on L2 Data + - The ratio of the number of cycles where the vL1D is stalled waiting for requested data to return from the [L2 cache](L2) divided by the number of cycles where the [vL1D is active](vL1d_activity). + - Percent +* - Stalled on L2 Requests + - The ratio of the number of cycles where the vL1D is stalled waiting to issue a request for data to the [L2 cache](L2) divided by the number of cycles where the [vL1D is active](vL1d_activity). + - Percent +* - Tag RAM Stall (Read/Write/Atomic) + - The ratio of the number of cycles where the vL1D is stalled due to Read/Write/Atomic requests with conflicting tags being looked up concurrently, divided by the number of cycles where the [vL1D is active](vL1d_activity). + - Percent +``` + +(TCP_cache_access_metrics)= +##### vL1D cache access metrics + +The vL1D cache access metrics broadly indicate the type of requests incoming from the [cache frontend](TA), the number of requests that were serviced by the vL1D, and the number & type of outgoing requests to the [L2 cache](L2).
In addition, this section includes the approximate latencies of accesses to the cache itself, along with latencies of read/write memory operations to the [L2 cache](L2). + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Total Requests + - The total number of incoming requests from the [address processing unit](TA) after coalescing. + - Requests +* - Total read/write/atomic requests + - The total number of incoming read/write/atomic requests from the [address processing unit](TA) after coalescing per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Cache Bandwidth + - The number of bytes looked up in the vL1D cache as a result of [VMEM](VALU) instructions per [normalization-unit](normunit). The number of bytes is calculated as the number of cache lines requested multiplied by the cache line size. This value does not consider partial requests, so e.g., if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. + - Bytes per [normalization-unit](normunit) +* - Cache Hit Rate + - The ratio of the number of vL1D cache line requests that hit in vL1D cache over the total number of cache line requests to the [vL1D Cache RAM](TC). + - Percent +* - Cache Accesses + - The total number of cache line lookups in the vL1D. + - Cache lines +* - Cache Hits + - The number of cache accesses minus the number of outgoing requests to the [L2 cache](L2), i.e., the number of cache line requests serviced by the [vL1D Cache RAM](TC) per [normalization-unit](normunit). + - Cache lines per [normalization-unit](normunit) +* - Invalidations + - The number of times the vL1D was issued a write-back invalidate command during the kernel's execution per [normalization-unit](normunit). This may be triggered by, e.g., the `buffer_wbinvl1` instruction. + - Invalidations per [normalization-unit](normunit) +* - L1-L2 Bandwidth + - The number of bytes transferred across the vL1D-L2 interface as a result of [VMEM](VALU) instructions, per [normalization-unit](normunit). The number of bytes is calculated as the number of cache lines requested multiplied by the cache line size. This value does not consider partial requests, so e.g., if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. + - Bytes per [normalization-unit](normunit) +* - L1-L2 Reads + - The number of read requests for a vL1D cache line that were not satisfied by the vL1D and must be retrieved from the [L2 Cache](L2) per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - L1-L2 Writes + - The number of post-coalescing write requests that are sent through the vL1D to the [L2 cache](L2), per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - L1-L2 Atomics + - The number of atomic requests that are sent through the vL1D to the [L2 cache](L2), per [normalization-unit](normunit). This includes requests for atomics with and without return. + - Requests per [normalization-unit](normunit) +* - L1 Access Latency + - The average number of cycles that a vL1D cache line request spent in the vL1D cache pipeline. + - Cycles +* - L1-L2 Read Access Latency + - The average number of cycles that the vL1D cache took to issue and receive read requests from the [L2 Cache](L2). This number also includes requests for atomics with return values.
+ - Cycles +* - L1-L2 Write Access Latency + - The average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the [L2 Cache](L2). This number also includes requests for atomics without return values. + - Cycles +``` + +```{note} +All cache accesses in vL1D are for a single cache line's worth of data. +The size of a cache line may vary, however on current AMD Instinct(tm) MI CDNA accelerators and GCN GPUs the L1 cache line size is 64B. +``` + +(TCP_TCC_Transactions_Detail)= +##### vL1D - L2 Transaction Detail + +This section provides a more granular look at the types of requests made to the [L2 cache](L2). +These are broken down by the operation type (read / write / atomic, with, or without return), and the [memory type](Mtype). +For more detail, the reader is referred to the [Memory Types](Mtype) section. + + +(TD)= +#### Vector L1 Data-Return Path or Texture Data (TD) + +The data-return path of the vL1D cache, also known as the Texture Data (TD) unit, is responsible for routing data returned from the [vL1D cache RAM](TC) back to a wavefront on a SIMD. +As described in the [vL1D cache front-end](TA) section, the data-return path is passed information about the space requirements and routing for data requests from the [VALU](valu). +When data is returned from the [vL1D cache RAM](TC), it is matched to this previously stored request data, and returned to the appropriate SIMD. + +Omniperf reports the following vL1D data-return path metrics: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Data-return Busy + - Percent of the [total CU cycles](TotalCUCycles) the data-return unit was busy processing or waiting on data to return to the [CU](CU). + - Percent +* - Cache RAM → Data-return Stall + - Percent of the [total CU cycles](TotalCUCycles) the data-return unit was stalled on data to be returned from the [vL1D Cache RAM](TC). + - Percent +* - Workgroup manager → Data-return Stall + - Percent of the [total CU cycles](TotalCUCycles) the data-return unit was stalled by the [workgroup manager](SPI) due to initialization of registers as a part of launching new workgroups. + - Percent +* - Coalescable Instructions + - The number of instructions submitted to the [data-return unit](TD) by the [address-processor](TA) that were found to be coalescable, per [normalization-unit](normunit). + - Instructions per [normalization-unit](normunit) +* - Read Instructions + - The number of read instructions submitted to the [data-return unit](TD) by the [address-processor](TA) summed over all [compute units](CU) on the accelerator, per [normalization-unit](normunit). This is expected to be the sum of global/generic and spill/stack reads in the [address processor](TA_inst). + - Instructions per [normalization-unit](normunit) +* - Write Instructions + - The number of store instructions submitted to the [data-return unit](TD) by the [address-processor](TA) summed over all [compute units](CU) on the accelerator, per [normalization-unit](normunit). This is expected to be the sum of global/generic and spill/stack stores counted by the [vL1D cache-frontend](TA_inst). + - Instructions per [normalization-unit](normunit) +* - Atomic Instructions + - The number of atomic instructions submitted to the [data-return unit](TD) by the [address-processor](TA) summed over all [compute units](CU) on the accelerator, per [normalization-unit](normunit). 
This is expected to be the sum of global/generic and spill/stack atomics in the [address processor](TA_inst). + - Instructions per [normalization-unit](normunit) +``` + +(L2)= +## L2 Cache (TCC) + +The L2 cache is the coherence point for current AMD Instinct(tm) MI GCN GPUs and CDNA accelerators, and is shared by all [compute units](CU) on the device. +Besides serving requests from the [vector L1 data caches](vL1D), the L2 cache also is responsible for servicing requests from the [L1 instruction caches](L1I), the [scalar L1 data caches](sL1D) and the [command-processor](CP). +The L2 cache is composed of a number of distinct channels (32 on MI100/[MI2XX](2xxnote) series CDNA accelerators at 256B address interleaving) which can largely operate independently. +Mapping of incoming requests to a specific L2 channel is determined by a hashing mechanism that attempts to evenly distribute requests across the L2 channels. +Requests that miss in the L2 cache are passed out to [Infinity Fabric(tm)](l2fabric) to be routed to the appropriate memory location. + +The L2 cache metrics reported by Omniperf are broken down into four categories: + + - L2 Speed-of-Light + - L2 Cache Accesses + - L2-Fabric Transactions + - L2-Fabric Stalls + + +(L2SoL)= +### L2 Speed-of-Light + +```{warning} +The theoretical maximum throughput for some metrics in this section are currently computed with the maximum achievable clock frequency, as reported by `rocminfo`, for an accelerator. This may not be realistic for all workloads. +``` + +The L2 cache's speed-of-light table contains a few key metrics about the performance of the L2 cache, aggregated over all the L2 channels, as a comparison with the peak achievable values of those metrics: + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Utilization + - The ratio of the [number of cycles an L2 channel was active, summed over all L2 channels on the accelerator](TotalActiveL2Cycles) over the [total L2 cycles](TotalL2Cycles). + - Percent +* - Bandwidth + - The number of bytes looked up in the L2 cache, as a percent of the peak theoretical bandwidth achievable on the specific accelerator. The number of bytes is calculated as the number of cache lines requested multiplied by the cache line size. This value does not consider partial requests, so e.g., if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. + - Percent +* - Hit Rate + - The ratio of the number of L2 cache line requests that hit in the L2 cache over the total number of incoming cache line requests to the L2 cache. + - Percent +* - L2-Fabric Read BW + - The number of bytes read by the L2 over the [Infinity Fabric(tm) interface](l2fabric) per unit time. + - GB/s +* - L2-Fabric Write and Atomic BW + - The number of bytes sent by the L2 over the [Infinity Fabric(tm) interface](l2fabric) by write and atomic operations per unit time. + - GB/s +``` + +```{note} +The L2 cache on AMD Instinct(tm) MI CDNA accelerators uses a "hit-on-miss" approach to reporting cache hits. +That is, if while satisfying a miss, another request comes in that would hit on the same pending cache line, the subsequent request will be counted as a 'hit'. +Therefore, it is also important to consider the latency metric in the [L2-Fabric](l2fabric) section when evaluating the L2 hit rate. 
+``` + +(L2_cache_metrics)= +### L2 Cache Accesses + +This section details the incoming requests to the L2 cache from the [vL1D](vL1D) and other clients (e.g., the [sL1D](sL1D) and [L1I](L1I) caches). + +```{list-table} +:header-rows: 1 +:widths: 13 70 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Bandwidth + - The number of bytes looked up in the L2 cache, per [normalization-unit](normunit). The number of bytes is calculated as the number of cache lines requested multiplied by the cache line size. This value does not consider partial requests, so e.g., if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. + - Bytes per [normalization-unit](normunit) +* - Requests + - The total number of incoming requests to the L2 from all clients for all request types, per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Read Requests + - The total number of read requests to the L2 from all clients. + - Requests per [normalization-unit](normunit) +* - Write Requests + - The total number of write requests to the L2 from all clients. + - Requests per [normalization-unit](normunit) +* - Atomic Requests + - The total number of atomic requests (with and without return) to the L2 from all clients. + - Requests per [normalization-unit](normunit) +* - Streaming Requests + - The total number of incoming requests to the L2 that are marked as 'streaming'. The exact meaning of this may differ depending on the targeted accelerator, however on an [MI2XX](2xxnote) this corresponds to [non-temporal load or stores](https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins). The L2 cache attempts to evict 'streaming' requests before normal requests when the L2 is at capacity. + - Requests per [normalization-unit](normunit) +* - Probe Requests + - The number of coherence probe requests made to the L2 cache from outside the accelerator. On an [MI2XX](2xxnote), probe requests may be generated by e.g., writes to [fine-grained device](MType) memory or by writes to [coarse-grained](MType) device memory. + - Requests per [normalization-unit](normunit) +* - Hit Rate + - The ratio of the number of L2 cache line requests that hit in the L2 cache over the total number of incoming cache line requests to the L2 cache. + - Percent +* - Hits + - The total number of requests to the L2 from all clients that hit in the cache. As noted in the [speed-of-light](L2SoL) section, this includes hit-on-miss requests. + - Requests per [normalization-unit](normunit) +* - Misses + - The total number of requests to the L2 from all clients that miss in the cache. As noted in the [speed-of-light](L2SoL) section, these do not include hit-on-miss requests. + - Requests per [normalization-unit](normunit) +* - Writebacks + - The total number of L2 cache lines written back to memory for any reason. Write-backs may occur due to e.g., user-code (e.g., HIP kernel calls to `__threadfence_system`, or atomic built-ins), by the [command-processor](CP)'s memory acquire/release fences, or for other internal hardware reasons. + - Cache lines per [normalization-unit](normunit) +* - Writebacks (Internal) + - The total number of L2 cache lines written back to memory for internal hardware reasons, per [normalization-unit](normunit). 
+ - Cache lines per [normalization-unit](normunit) +* - Writebacks (vL1D Req) + - The total number of L2 cache lines written back to memory due to requests initiated by the [vL1D cache](vL1D), per [normalization-unit](normunit). + - Cache lines per [normalization-unit](normunit) +* - Evictions (Normal) + - The total number of L2 cache lines evicted from the cache due to capacity limits, per [normalization-unit](normunit). + - Cache lines per [normalization-unit](normunit) +* - Evictions (vL1D Req) + - The total number of L2 cache lines evicted from the cache due to invalidation requests initiated by the [vL1D cache](vL1D), per [normalization-unit](normunit). + - Cache lines per [normalization-unit](normunit) +* - Non-hardware-Coherent Requests + - The total number of requests to the L2 made to Not-hardware-Coherent (NC) memory allocations, per [normalization-unit](normunit). See the [Memory Types section](Mtype) for more detail. + - Requests per [normalization-unit](normunit) +* - Uncached Requests + - The total number of requests to the L2 made to uncached (UC) memory allocations. See the [Memory Types section](Mtype) for more detail. + - Requests per [normalization-unit](normunit) +* - Coherently Cached Requests + - The total number of requests to the L2 made to coherently cachable (CC) memory allocations. See the [Memory Types section](Mtype) for more detail. + - Requests per [normalization-unit](normunit) +* - Read/Write Coherent Requests + - The total number of requests to the L2 made to Read-Write coherent (RW) memory allocations. See the [Memory Types section](Mtype) for more detail. + - Requests per [normalization-unit](normunit) +``` + +```{note} +All requests to the L2 are for a single cache line's worth of data. +The size of a cache line may vary depending on the accelerator, however on an AMD Instinct(tm) CDNA2 [MI2XX](2xxnote) accelerator, it is 128B, while on an MI100, it is 64B. +``` + +(l2fabric)= +### L2-Fabric transactions + +Requests/data that miss in the L2 must be routed to memory in order to service them. +The backing memory for a request may be local to this accelerator (i.e., in the local high-bandwidth memory), in a remote accelerator's memory, or even in the CPU's memory. +Infinity Fabric(tm) is responsible for routing these memory requests/data to the correct location and returning any fetched data to the L2 cache. +The [following section](L2_req_flow) describes the flow of these requests through Infinity Fabric(tm) in more detail, as described by Omniperf metrics, while [later sections](L2_req_metrics) give detailed definitions of individual metrics. + +(L2_req_flow)= +#### Request flow + +Below is a diagram that illustrates how L2↔Fabric requests are reported by Omniperf: + + +```{figure} images/fabric.png +:alt: L2↔Fabric transaction flow on AMD Instinct(tm) MI accelerators. +:align: center +:name: fabric-fig + +L2↔Fabric transaction flow on AMD Instinct(tm) MI accelerators. +``` + +Requests from the L2 Cache are broken down into two major categories: read requests and write requests (at this granularity, atomic requests are treated as writes). + +From there, these requests can additionally be subdivided in a number of ways. +First, these requests may be sent across Infinity Fabric(tm) as different transaction sizes, 32B or 64B on current CDNA accelerators. + +```{note} +On current CDNA accelerators, the 32B read request path is expected to be unused (hence: is disconnected in the flow diagram).
+``` + +In addition, the read and write requests can be further categorized as: + - uncached read/write requests, e.g., for accesses to [fine-grained memory](Mtype) + - atomic requests, e.g., for atomic updates to [fine-grained memory](Mtype) + - HBM read/write requests OR remote read/write requests, i.e., for requests to the accelerator's local HBM OR requests to a remote accelerator's HBM / the CPU's DRAM. + +These classifications are not necessarily _exclusive_; for example, a write request can be classified as both an atomic request to the accelerator's local HBM and an uncached write request. +The request-flow diagram marks _exclusive_ classifications as a splitting of the flow, while _non-exclusive_ requests do not split the flow line. +For example, a request is either a 32B Write Request OR a 64B Write Request, as the flow splits at this point: +```{figure} images/split.* +:scale: 50 % +:alt: Request flow splitting +:align: center +:name: split-request-flow-fig + +Splitting request flow +``` +However, continuing along, the same request might be an Atomic request and an Uncached Write request, as reflected by a non-split flow: +```{figure} images/nosplit.* +:scale: 50 % +:alt: Non-splitting request flow +:align: center +:name: nosplit-request-flow-fig + +Non-splitting request flow +``` + +Finally, we note that [uncached](Mtype) read requests (e.g., to [fine-grained memory](Mtype)) are handled specially on CDNA accelerators, as indicated in the request flow diagram. +These are expected to be counted as a 64B Read Request, and _if_ they are requests to uncached memory (denoted by the dashed line), they will also be counted as _two_ uncached read requests (i.e., the request is split): + +```{figure} images/uncached.* +:scale: 50 % +:alt: Uncached read-request splitting +:align: center +:name: uncached-read-request-flow-fig + +Uncached read-request splitting. +``` + +(L2_req_metrics)= +#### Metrics + + +The following metrics are reported for the L2-Fabric interface: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - L2-Fabric Read Bandwidth + - The total number of bytes read by the L2 cache from Infinity Fabric(tm) per [normalization-unit](normunit). + - Bytes per [normalization-unit](normunit) +* - HBM Read Traffic + - The percent of read requests generated by the L2 cache that are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown does not consider the _size_ of the request (i.e., 32B and 64B requests are both counted as a single request), so this metric only _approximates_ the percent of the L2-Fabric Read bandwidth directed to the local HBM. + - Percent +* - Remote Read Traffic + - The percent of read requests generated by the L2 cache that are routed to any memory location other than the accelerator's local high-bandwidth memory (HBM) --- e.g., the CPU's DRAM, a remote accelerator's HBM, etc. This breakdown does not consider the _size_ of the request (i.e., 32B and 64B requests are both counted as a single request), so this metric only _approximates_ the percent of the L2-Fabric Read bandwidth directed to a remote location. + - Percent +* - Uncached Read Traffic + - The percent of read requests generated by the L2 cache that are reading from an [uncached memory allocation](Mtype).
Note, as described in the [request-flow](L2_req_flow) section, a single 64B read request is typically counted as two uncached read requests, hence it is possible for the Uncached Read Traffic to reach up to 200% of the total number of read requests. This breakdown does not consider the _size_ of the request (i.e., 32B and 64B requests are both counted as a single request), so this metric only _approximates_ the percent of the L2-Fabric read bandwidth directed to an uncached memory location. + - Percent +* - L2-Fabric Write and Atomic Bandwidth + - The total number of bytes written by the L2 over Infinity Fabric(tm) by write and atomic operations per [normalization-unit](normunit). Note that on current CDNA accelerators, such as the [MI2XX](2xxnote), requests are only considered 'atomic' by Infinity Fabric(tm) if they are targeted at non-write-cachable memory, e.g., [fine-grained memory](Mtype) allocations or [uncached memory](Mtype) allocations on the [MI2XX](2xxnote). + - Bytes per [normalization-unit](normunit) +* - HBM Write and Atomic Traffic + - The percent of write and atomic requests generated by the L2 cache that are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown does not consider the _size_ of the request (i.e., 32B and 64B requests are both counted as a single request), so this metric only _approximates_ the percent of the L2-Fabric Write and Atomic bandwidth directed to the local HBM. Note that on current CDNA accelerators, such as the [MI2XX](2xxnote), requests are only considered 'atomic' by Infinity Fabric(tm) if they are targeted at [fine-grained memory](Mtype) allocations or [uncached memory](Mtype) allocations. + - Percent +* - Remote Write and Atomic Traffic + - The percent of write and atomic requests generated by the L2 cache that are routed to any memory location other than the accelerator's local high-bandwidth memory (HBM) --- e.g., the CPU's DRAM, a remote accelerator's HBM, etc. This breakdown does not consider the _size_ of the request (i.e., 32B and 64B requests are both counted as a single request), so this metric only _approximates_ the percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location. Note that on current CDNA accelerators, such as the [MI2XX](2xxnote), requests are only considered 'atomic' by Infinity Fabric(tm) if they are targeted at non-write-cachable memory, e.g., [fine-grained memory](Mtype) allocations or [uncached memory](Mtype) allocations on the [MI2XX](2xxnote). + - Percent +* - Atomic Traffic + - The percent of write requests generated by the L2 cache that are atomic requests to _any_ memory location. This breakdown does not consider the _size_ of the request (i.e., 32B and 64B requests are both counted as a single request), so this metric only _approximates_ the percent of the L2-Fabric Write and Atomic bandwidth that is due to use of atomics. Note that on current CDNA accelerators, such as the [MI2XX](2xxnote), requests are only considered 'atomic' by Infinity Fabric(tm) if they are targeted at [fine-grained memory](Mtype) allocations or [uncached memory](Mtype) allocations. + - Percent +* - Uncached Write and Atomic Traffic + - The percent of write and atomic requests generated by the L2 cache that are targeting [uncached memory allocations](Mtype). 
This breakdown does not consider the _size_ of the request (i.e., 32B and 64B requests are both counted as a single request), so this metric only _approximates_ the percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations. + - Percent +* - Read Latency + - The time-averaged number of cycles read requests spent in Infinity Fabric(tm) before data was returned to the L2. + - Cycles +* - Write Latency + - The time-averaged number of cycles write requests spent in Infinity Fabric(tm) before a completion acknowledgement was returned to the L2. + - Cycles +* - Atomic Latency + - The time-averaged number of cycles atomic requests spent in Infinity Fabric(tm) before a completion acknowledgement (atomic without return value) or data (atomic with return value) was returned to the L2. + - Cycles +* - Read Stall + - The ratio of the total number of cycles the L2-Fabric interface was stalled on a read request to any destination (local HBM, remote PCIe(r) connected accelerator / CPU, or remote Infinity Fabric(tm) connected accelerator{sup}`1` / CPU) over the [total active L2 cycles](TotalActiveL2Cycles). + - Percent +* - Write Stall + - The ratio of the total number of cycles the L2-Fabric interface was stalled on a write or atomic request to any destination (local HBM, remote PCIe(r) connected accelerator / CPU, or remote Infinity Fabric(tm) connected accelerator{sup}`1` / CPU) over the [total active L2 cycles](TotalActiveL2Cycles). + - Percent +``` + +(L2_req_metric_details)= +#### Detailed Transaction Metrics + +The following metrics are available in the detailed L2-Fabric transaction breakdown table: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - 32B Read Requests + - The total number of L2 requests to Infinity Fabric(tm) to read 32B of data from any memory location, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. Typically unused on CDNA accelerators. + - Requests per [normalization-unit](normunit) +* - Uncached Read Requests + - The total number of L2 requests to Infinity Fabric(tm) to read [uncached data](Mtype) from any memory location, per [normalization-unit](normunit). 64B requests for uncached data are counted as two 32B uncached data requests. See [request-flow](L2_req_flow) for more detail. + - Requests per [normalization-unit](normunit) +* - 64B Read Requests + - The total number of L2 requests to Infinity Fabric(tm) to read 64B of data from any memory location, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. + - Requests per [normalization-unit](normunit) +* - HBM Read Requests + - The total number of L2 requests to Infinity Fabric(tm) to read 32B or 64B of data from the accelerator's local HBM, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. + - Requests per [normalization-unit](normunit) +* - Remote Read Requests + - The total number of L2 requests to Infinity Fabric(tm) to read 32B or 64B of data from any source other than the accelerator's local HBM, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. + - Requests per [normalization-unit](normunit) +* - 32B Write and Atomic Requests + - The total number of L2 requests to Infinity Fabric(tm) to write or atomically update 32B of data to any memory location, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail.
+ - Requests per [normalization-unit](normunit) +* - Uncached Write and Atomic Requests + - The total number of L2 requests to Infinity Fabric(tm) to write or atomically update 32B or 64B of [uncached data](Mtype), per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. + - Requests per [normalization-unit](normunit) +* - 64B Write and Atomic Requests + - The total number of L2 requests to Infinity Fabric(tm) to write or atomically update 64B of data in any memory location, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. + - Requests per [normalization-unit](normunit) +* - HBM Write and Atomic Requests + - The total number of L2 requests to Infinity Fabric(tm) to write or atomically update 32B or 64B of data in the accelerator's local HBM, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. + - Requests per [normalization-unit](normunit) +* - Remote Write and Atomic Requests + - The total number of L2 requests to Infinity Fabric(tm) to write or atomically update 32B or 64B of data in any memory location other than the accelerator's local HBM, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. + - Requests per [normalization-unit](normunit) +* - Atomic Requests + - The total number of L2 requests to Infinity Fabric(tm) to atomically update 32B or 64B of data in any memory location, per [normalization-unit](normunit). See [request-flow](L2_req_flow) for more detail. Note that on current CDNA accelerators, such as the [MI2XX](2xxnote), requests are only considered 'atomic' by Infinity Fabric(tm) if they are targeted at non-write-cachable memory, e.g., [fine-grained memory](Mtype) allocations or [uncached memory](Mtype) allocations on the [MI2XX](2xxnote). + - Requests per [normalization-unit](normunit) +``` + +### L2-Fabric Interface Stalls + +When the interface between the L2 cache and Infinity Fabric(tm) becomes backed up by requests, it may stall preventing the L2 from issuing additional requests to Infinity Fabric(tm) until prior requests complete. +This section gives a breakdown of what types of requests in a kernel caused a stall (e.g., read vs write), and to which locations (e.g., to the accelerator's local memory, or to remote accelerators/CPUs). + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Read - PCIe(r) Stall + - The number of cycles the L2-Fabric interface was stalled on read requests to remote PCIe(r) connected accelerators{sup}`1` or CPUs as a percent of the [total active L2 cycles](TotalActiveL2Cycles). + - Percent +* - Read - Infinity Fabric(tm) Stall + - The number of cycles the L2-Fabric interface was stalled on read requests to remote Infinity Fabric(tm) connected accelerators{sup}`1` or CPUs as a percent of the [total active L2 cycles](TotalActiveL2Cycles). + - Percent +* - Read - HBM Stall + - The number of cycles the L2-Fabric interface was stalled on read requests to the accelerator's local HBM as a percent of the [total active L2 cycles](TotalActiveL2Cycles). + - Percent +* - Write - PCIe(r) Stall + - The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote PCIe(r) connected accelerators{sup}`1` or CPUs as a percent of the [total active L2 cycles](TotalActiveL2Cycles). 
+ - Percent +* - Write - Infinity Fabric(tm) Stall + - The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote Infinity Fabric(tm) connected accelerators{sup}`1` or CPUs as a percent of the [total active L2 cycles](TotalActiveL2Cycles). + - Percent +* - Write - HBM Stall + - The number of cycles the L2-Fabric interface was stalled on write or atomic requests to the accelerator's local HBM as a percent of the [total active L2 cycles](TotalActiveL2Cycles). + - Percent +* - Write - Credit Starvation + - The number of cycles the L2-Fabric interface was stalled on write or atomic requests to any memory location because too many write/atomic requests were currently in flight, as a percent of the [total active L2 cycles](TotalActiveL2Cycles). + - Percent +``` + +```{note} +{sup}`1` In addition to being used for on-accelerator data-traffic, AMD [Infinity Fabric](https://www.amd.com/en/technologies/infinity-architecture)(tm) technology can be used to connect multiple accelerators to achieve advanced peer-to-peer connectivity and enhanced bandwidths over traditional PCIe(r) connections. +Some AMD Instinct(tm) MI accelerators, e.g., the MI250X, [feature coherent CPU↔accelerator connections built using AMD Infinity Fabric(tm)](https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf). +``` + +```{warning} +On current CDNA accelerators and GCN GPUs, these L2↔Fabric stalls can be undercounted in some circumstances. +``` + +(SE)= +## Shader Engine (SE) + +The [CUs](CU) on a CDNA accelerator are grouped together into a higher-level organizational unit called a Shader Engine (SE): + +```{figure} images/selayout.png +:alt: Example of CU-grouping into shader-engines on AMD Instinct(tm) MI accelerators. +:align: center +:name: selayout-fig + +Example of CU-grouping into shader-engines on AMD Instinct(tm) MI accelerators. +``` + +The number of CUs on an SE varies from chip to chip (see, for example [AMD GPU HIP Training](https://www.olcf.ornl.gov/wp-content/uploads/2019/09/AMD_GPU_HIP_training_20190906.pdf), slide 20). +In addition, newer accelerators such as the AMD Instinct(tm) MI250X have 8 SEs per accelerator. + +For the purposes of Omniperf, we consider resources that are shared between multiple CUs on a single SE as part of the SE's metrics. +These include: + - the [scalar L1 data cache](sL1D) + - the [L1 instruction cache](L1I) + - the [workgroup manager](SPI) + +(sL1D)= +### Scalar L1 Data Cache (sL1D) + +The Scalar L1 Data cache (sL1D) can cache data accessed from scalar load instructions (and scalar store instructions on architectures where they exist) from wavefronts in the [CUs](CU). +The sL1D is shared between multiple CUs ([GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah), slide 36) --- the exact number of CUs depends on the architecture in question (3 CUs in GCN GPUs and MI100, 2 CUs in [MI2XX](2xxnote)) --- and is backed by the [L2](L2) cache. + +In typical usage, the data in the sL1D includes, e.g.: + - Kernel arguments, e.g., pointers, [non-populated](https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-sgpr-register-set-up-order-table) grid/block dimensions, etc. + - HIP's `__constant__` memory, when accessed in a provably uniform{sup}`1` manner + - Other memory, when accessed in a provably uniform manner, *and* the backing memory is provably constant{sup}`1` + +```{note} +{sup}`1` +The scalar data cache is used when the compiler emits scalar loads to access data. +This requires that the data be _provably_ uniformly accessed (i.e., the compiler can verify that all work-items in a wavefront access the same data), _and_ that the data can be proven to be read-only (e.g., HIP's `__constant__` memory, or properly `__restrict__`'ed pointers to avoid write-aliasing). +Accesses to, e.g., `__constant__` memory are not guaranteed to go through the sL1D if, e.g., the wavefront loads a non-uniform value. +```
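+
+As a concrete illustration of the kind of access the compiler can service from the sL1D, consider the following sketch. This is a hypothetical kernel (not one of the Omniperf samples); whether a particular load is actually lowered to a scalar load depends on the compiler version and optimization level:
+
+```c++
+#include <hip/hip_runtime.h>
+
+// `coeffs[0]` is the same address for every work-item in the wavefront
+// (provably uniform) and `coeffs` is `__restrict__`'ed and only read
+// (provably read-only), so the compiler may emit a scalar load (s_load_*)
+// for it, which would be serviced by the sL1D.
+__global__ void scale(float* __restrict__ out, const float* __restrict__ in,
+                      const float* __restrict__ coeffs, size_t n) {
+  const float c = coeffs[0];  // candidate for a scalar load via the sL1D
+  const size_t idx = threadIdx.x + blockIdx.x * static_cast<size_t>(blockDim.x);
+  if (idx < n) {
+    // `in[idx]` varies per work-item, so it is a vector load through the vL1D.
+    out[idx] = c * in[idx];
+  }
+}
+```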
+ +(sL1D_SOL)= +#### Scalar L1D Speed-of-Light + +```{warning} +The theoretical maximum throughput for some metrics in this section is currently computed with the maximum achievable clock frequency, as reported by `rocminfo`, for an accelerator. This may not be realistic for all workloads. +``` + +The Scalar L1D speed-of-light chart shows some key metrics of the sL1D cache as a comparison with the peak achievable values of those metrics: + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Bandwidth + - The number of bytes looked up in the sL1D cache, as a percent of the peak theoretical bandwidth. Calculated as the ratio of sL1D requests over the [total sL1D cycles](TotalSL1DCycles). + - Percent +* - Cache Hit Rate + - The percent of sL1D requests that hit{sup}`1` on a previously loaded line in the cache. Calculated as the ratio of the number of sL1D requests that hit over the number of all sL1D requests. + - Percent +* - sL1D-L2 BW + - The number of bytes requested by the sL1D from the L2 cache, as a percent of the peak theoretical sL1D → L2 cache bandwidth. Calculated as the ratio of the total number of requests from the sL1D to the L2 cache over the [total sL1D-L2 interface cycles](TotalSL1DCycles). + - Percent +``` + +```{note} +{sup}`1` Unlike the [vL1D](vL1D) and [L2](L2) caches, the sL1D cache on AMD Instinct(tm) MI CDNA accelerators does _not_ use a "hit-on-miss" approach to reporting cache hits. +That is, if while satisfying a miss, another request comes in that would hit on the same pending cache line, the subsequent request will be counted as a 'duplicated miss' (see below). +``` + +#### Scalar L1D Cache Accesses + +This panel gives more detail on the types of accesses made to the sL1D, and the hit/miss statistics. + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Requests + - The total number of requests, of any size or type, made to the sL1D per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Hits + - The total number of sL1D requests that hit on a previously loaded cache line, per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Misses - Non Duplicated + - The total number of sL1D requests that missed on a cache line that *was not* already pending due to another request, per [normalization-unit](normunit). See note in [speed-of-light section](sL1D_SOL) for more detail. + - Requests per [normalization-unit](normunit) +* - Misses - Duplicated + - The total number of sL1D requests that missed on a cache line that *was* already pending due to another request, per [normalization-unit](normunit). See note in [speed-of-light section](sL1D_SOL) for more detail. + - Requests per [normalization-unit](normunit) +* - Cache Hit Rate + - Indicates the percent of sL1D requests that hit on a previously loaded line in the cache. The ratio of the number of sL1D requests that hit{sup}`1` over the number of all sL1D requests.
+ - Percent +* - Read Requests (Total) + - The total number of sL1D read requests of any size, per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Atomic Requests + - The total number of sL1D atomic requests of any size, per [normalization-unit](normunit). Typically unused on CDNA accelerators. + - Requests per [normalization-unit](normunit) +* - Read Requests (1 DWord) + - The total number of sL1D read requests made for a single dword of data (4B), per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Read Requests (2 DWord) + - The total number of sL1D read requests made for two dwords of data (8B), per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Read Requests (4 DWord) + - The total number of sL1D read requests made for four dwords of data (16B), per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Read Requests (8 DWord) + - The total number of sL1D read requests made for eight dwords of data (32B), per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Read Requests (16 DWord) + - The total number of sL1D read requests made for sixteen dwords of data (64B), per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +``` + +```{note} +{sup}`1` Unlike the [vL1D](vL1D) and [L2](L2) caches, the sL1D cache on AMD Instinct(tm) MI CDNA accelerators does _not_ use a "hit-on-miss" approach to reporting cache hits. +That is, if while satisfying a miss, another request comes in that would hit on the same pending cache line, the subsequent request will be counted as a 'duplicated miss' (see below). +``` + +#### sL1D ↔ L2 Interface + +This panel gives more detail on the data requested across the sL1D↔[L2](L2) interface. + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - sL1D-L2 BW + - The total number of bytes read from/written to/atomically updated across the sL1D↔[L2](L2) interface, per [normalization-unit](normunit). Note that sL1D writes and atomics are typically unused on current CDNA accelerators, so in the majority of cases this can be interpreted as an sL1D→L2 read bandwidth. + - Bytes per [normalization-unit](normunit) +* - Read Requests + - The total number of read requests from sL1D to the [L2](L2), per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Write Requests + - The total number of write requests from sL1D to the [L2](L2), per [normalization-unit](normunit). Typically unused on current CDNA accelerators. + - Requests per [normalization-unit](normunit) +* - Atomic Requests + - The total number of atomic requests from sL1D to the [L2](L2), per [normalization-unit](normunit). Typically unused on current CDNA accelerators. + - Requests per [normalization-unit](normunit) +* - Stall Cycles + - The total number of cycles the sL1D↔[L2](L2) interface was stalled, per [normalization-unit](normunit). + - Cycles per [normalization-unit](normunit) +``` + +(L1I)= +### L1 Instruction Cache (L1I) + +As with the [sL1D](sL1D), the L1 Instruction (L1I) cache is shared between multiple CUs on a shader-engine, where the precise number of CUs sharing an L1I depends on the architecture in question ([GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah), slide 36) and is backed by the [L2](L2) cache.
+Unlike the sL1D, the instruction cache is read-only. + +(L1I_SOL)= +#### L1I Speed-of-Light + +```{warning} +The theoretical maximum throughput for some metrics in this section is currently computed with the maximum achievable clock frequency, as reported by `rocminfo`, for an accelerator. This may not be realistic for all workloads. +``` + +The L1 Instruction Cache speed-of-light chart shows some key metrics of the L1I cache as a comparison with the peak achievable values of those metrics: + +```{list-table} +:header-rows: 1 +:widths: 15 70 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Bandwidth + - The number of bytes looked up in the L1I cache, as a percent of the peak theoretical bandwidth. Calculated as the ratio of L1I requests over the [total L1I cycles](TotalL1ICycles). + - Percent +* - Cache Hit Rate + - The percent of L1I requests that hit on a previously loaded line in the cache. Calculated as the ratio of the number of L1I requests that hit{sup}`1` over the number of all L1I requests. + - Percent +* - L1I-L2 BW + - The percent of the peak theoretical L1I → L2 cache request bandwidth achieved. Calculated as the ratio of the total number of requests from the L1I to the L2 cache over the [total L1I-L2 interface cycles](TotalL1ICycles). + - Percent +* - Instruction Fetch Latency + - The average number of cycles spent to fetch instructions to a [CU](cu). + - Cycles +``` + +```{note} +{sup}`1` Unlike the [vL1D](vL1D) and [L2](L2) caches, the L1I cache on AMD Instinct(tm) MI CDNA accelerators does _not_ use a "hit-on-miss" approach to reporting cache hits. +That is, if while satisfying a miss, another request comes in that would hit on the same pending cache line, the subsequent request will be counted as a 'duplicated miss' (see below). +``` + +#### L1I Cache Accesses + +This panel gives more detail on the hit/miss statistics of the L1I: + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Requests + - The total number of requests made to the L1I per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Hits + - The total number of L1I requests that hit on a previously loaded cache line, per [normalization-unit](normunit). + - Requests per [normalization-unit](normunit) +* - Misses - Non Duplicated + - The total number of L1I requests that missed on a cache line that *was not* already pending due to another request, per [normalization-unit](normunit). See note in [speed-of-light section](L1I_SOL) for more detail. + - Requests per [normalization-unit](normunit) +* - Misses - Duplicated + - The total number of L1I requests that missed on a cache line that *was* already pending due to another request, per [normalization-unit](normunit). See note in [speed-of-light section](L1I_SOL) for more detail. + - Requests per [normalization-unit](normunit) +* - Cache Hit Rate + - The percent of L1I requests that hit{sup}`1` on a previously loaded line in the cache. Calculated as the ratio of the number of L1I requests that hit over the number of all L1I requests. + - Percent +``` + +```{note} +{sup}`1` Unlike the [vL1D](vL1D) and [L2](L2) caches, the L1I cache on AMD Instinct(tm) MI CDNA accelerators does _not_ use a "hit-on-miss" approach to reporting cache hits. +That is, if while satisfying a miss, another request comes in that would hit on the same pending cache line, the subsequent request will be counted as a 'duplicated miss' (see below).
+``` + +#### L1I - L2 Interface + +This panel gives more detail on the data requested across the L1I-[L2](L2) interface. + +```{list-table} +:header-rows: 1 +:widths: 18 65 17 +:class: noscroll-table +* - Metric + - Description + - Unit +* - L1I-L2 BW + - The total number of bytes read across the L1I-[L2](L2) interface, per [normalization-unit](normunit). + - Bytes per [normalization-unit](normunit) +``` + +(SPI)= +### Workgroup manager (SPI) + +The workgroup manager (SPI) is the bridge between the [command processor](CP) and the [compute units](CU). +After the [command processor](cp) processes a kernel dispatch, it will then pass the dispatch off to the workgroup manager, which then schedules [workgroups](workgroup) onto the [compute units](CU). +As workgroups complete execution and resources become available, the workgroup manager will schedule new workgroups onto [compute units](CU). +The workgroup manager's metrics therefore are focused on reporting, e.g.: + + - Utilizations of various parts of the accelerator that the workgroup manager interacts with (and the workgroup manager itself) + - How many workgroups were dispatched, their size, and how many resources they used + - Percent of scheduler opportunities (cycles) where workgroups failed to dispatch, and + - Percent of scheduler opportunities (cycles) where workgroups failed to dispatch due to lack of a specific resource on the CUs (e.g., too many VGPRs allocated) + +This gives the user an idea of why the workgroup manager couldn't schedule more wavefronts onto the device, and is most useful for workloads that the user suspects to be scheduling/launch-rate limited. + +As discussed in the [command processor](cp) description, the command processor on AMD Instinct(tm) MI architectures contains four hardware scheduler-pipes, each with eight software threads ([“Vega10” - Mantor](https://old.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.120-Radeon-Vega10-Mantor-AMD-f1.pdf), slide 19). +Each scheduler-pipe can issue a kernel dispatch to the workgroup manager to schedule concurrently. +Therefore, some workgroup manager metrics are presented relative to the utilization of these scheduler-pipes (e.g., whether all four are issuing concurrently). + +```{note} +Current versions of the profiling libraries underlying Omniperf attempt to serialize concurrent kernels running on the accelerator, as the performance counters on the device are global (i.e., shared between concurrent kernels). +This means that these scheduler-pipe utilization metrics are expected to reach e.g., a maximum of one pipe active, i.e., only 25\%. +``` + +#### Workgroup Manager Utilizations + +This section describes the utilization of the workgroup manager, and the hardware components it interacts with. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Accelerator Utilization + - The percent of cycles in the kernel where the accelerator was actively doing any work. + - Percent +* - Scheduler-Pipe Utilization + - The percent of [total scheduler-pipe cycles](TotalPipeCycles) in the kernel where the scheduler-pipes were actively doing any work. Note: this value is expected to range between 0-25%, see note in [workgroup-manager](SPI) description. + - Percent +* - Workgroup Manager Utilization + - The percent of cycles in the kernel where the Workgroup Manager was actively doing any work. 
+ - Percent +* - Shader Engine Utilization + - The percent of [total shader-engine cycles](TotalSECycles) in the kernel where any CU in a shader-engine was actively doing any work, normalized over all shader-engines. Low values (e.g., << 100%) indicate that the accelerator was not fully saturated by the kernel, or a potential load-imbalance issue. + - Percent +* - SIMD Utilization + - The percent of [total SIMD cycles](TotalSIMDCycles) in the kernel where any [SIMD](VALU) on a CU was actively doing any work, summed over all CUs. Low values (e.g., << 100%) indicate that the accelerator was not fully saturated by the kernel, or a potential load-imbalance issue. + - Percent +* - Dispatched Workgroups + - The total number of workgroups forming this kernel launch. + - Workgroups +* - Dispatched Wavefronts + - The total number of wavefronts, summed over all workgroups, forming this kernel launch. + - Wavefronts +* - VGPR Writes + - The average number of cycles spent initializing [VGPRs](valu) at wave creation. + - Cycles/wave +* - SGPR Writes + - The average number of cycles spent initializing [SGPRs](salu) at wave creation. + - Cycles/wave +``` + +#### Workgroup Manager - Resource Allocation + +This panel gives more detail on how workgroups/wavefronts were scheduled onto compute units, and what occupancy limiters they hit (if any). +When analyzing these metrics, the user should also take into account their achieved occupancy (i.e., [Wavefront occupancy](Wavefront_runtime_stats)). +A kernel may be occupancy limited by e.g., LDS usage, but may still achieve high occupancy levels such that improving occupancy further may not improve performance. +See the [Workgroup Manager - Occupancy Limiters](Occupancy_example) example for more details. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - Not-scheduled Rate (Workgroup Manager) + - The percent of [total scheduler-pipe cycles](TotalPipeCycles) in the kernel where a workgroup could not be scheduled to a [CU](CU) due to a bottleneck within the workgroup manager rather than a lack of a [CU](CU)/[SIMD](VALU) with sufficient resources. Note: this value is expected to range between 0-25%, see note in [workgroup-manager](SPI) description. + - Percent +* - Not-scheduled Rate (Scheduler-Pipe) + - The percent of [total scheduler-pipe cycles](TotalPipeCycles) in the kernel where a workgroup could not be scheduled to a [CU](CU) due to a bottleneck within the scheduler-pipes rather than a lack of a [CU](CU)/[SIMD](VALU) with sufficient resources. Note: this value is expected to range between 0-25%, see note in [workgroup-manager](SPI) description. + - Percent +* - Scheduler-Pipe Stall Rate + - The percent of [total scheduler-pipe cycles](TotalPipeCycles) in the kernel where a workgroup could not be scheduled to a [CU](CU) due to occupancy limitations (i.e., a lack of a [CU](CU)/[SIMD](VALU) with sufficient resources). Note: this value is expected to range between 0-25%, see note in [workgroup-manager](SPI) description. + - Percent +* - Scratch Stall Rate + - The percent of [total shader-engine cycles](TotalSECycles) in the kernel where a workgroup could not be scheduled to a [CU](CU) due to lack of [private (a.k.a., scratch) memory](Mtype) slots. While this can reach up to 100\%, we note that the actual occupancy limitations on a kernel using private memory are typically quite small (e.g., <1\% of the total number of waves that can be scheduled to an accelerator). 
+ - Percent +* - Insufficient SIMD Waveslots + - The percent of [total SIMD cycles](TotalSIMDCycles) in the kernel where a workgroup could not be scheduled to a [SIMD](valu) due to lack of available [waveslots](valu). + - Percent +* - Insufficient SIMD VGPRs + - The percent of [total SIMD cycles](TotalSIMDCycles) in the kernel where a workgroup could not be scheduled to a [SIMD](valu) due to lack of available [VGPRs](valu). + - Percent +* - Insufficient SIMD SGPRs + - The percent of [total SIMD cycles](TotalSIMDCycles) in the kernel where a workgroup could not be scheduled to a [SIMD](valu) due to lack of available [SGPRs](salu). + - Percent +* - Insufficient CU LDS + - The percent of [total CU cycles](TotalCUCycles) in the kernel where a workgroup could not be scheduled to a [CU](cu) due to lack of available [LDS](lds). + - Percent +* - Insufficient CU Barriers + - The percent of [total CU cycles](TotalCUCycles) in the kernel where a workgroup could not be scheduled to a [CU](cu) due to lack of available [barriers](barrier). + - Percent +* - Reached CU Workgroup Limit + - The percent of [total CU cycles](TotalCUCycles) in the kernel where a workgroup could not be scheduled to a [CU](cu) due to limits within the workgroup manager. This is expected to always be zero on CDNA2 or newer accelerators (and small for previous accelerators). + - Percent +* - Reached CU Wavefront Limit + - The percent of [total CU cycles](TotalCUCycles) in the kernel where a wavefront could not be scheduled to a [CU](cu) due to limits within the workgroup manager. This is expected to always be zero on CDNA2 or newer accelerators (and small for previous accelerators). + - Percent +``` + +(CP)= +## Command Processor (CP) + +The command processor -- a.k.a., the CP -- is responsible for interacting with the AMDGPU Kernel Driver (a.k.a., the Linux Kernel) on the CPU and for interacting with user-space HSA clients when they submit commands to HSA queues. +Basic tasks of the CP include reading commands (e.g., corresponding to a kernel launch) out of [HSA Queues](http://hsafoundation.com/wp-content/uploads/2021/02/HSA-Runtime-1.2.pdf) (Sec. 2.5), scheduling work to subsequent parts of the scheduler pipeline, and marking kernels complete for synchronization events on the host. + +The command processor is composed of two sub-components: + + - Fetcher (CPF): Fetches commands out of memory to hand them over to the CPC for processing + - Packet Processor (CPC): The micro-controller running the command processing firmware that decodes the fetched commands, and (for kernels) passes them to the [Workgroup Processors](SPI) for scheduling + +Before scheduling work to the accelerator, the command-processor can first acquire a memory fence to ensure system consistency [(Sec 2.6.4)](http://hsafoundation.com/wp-content/uploads/2021/02/HSA-Runtime-1.2.pdf). +After the work is complete, the command-processor can apply a memory-release fence. +Depending on the AMD CDNA accelerator in question, either of these operations _may_ initiate a cache write-back or invalidation. + +Analyzing command processor performance is most interesting for kernels that the user suspects to be scheduling/launch-rate limited.
+The command processor's metrics are therefore focused on reporting, e.g.: + + - Utilization of the fetcher + - Utilization of the packet processor, and of its packet decoding + - Fetch/processing stalls + +### Command Processor Fetcher (CPF) Metrics + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - CPF Utilization + - Percent of total cycles where the CPF was busy actively doing any work. The ratio of CPF busy cycles over total cycles counted by the CPF. + - Percent +* - CPF Stall + - Percent of CPF busy cycles where the CPF was stalled for any reason. + - Percent +* - CPF-L2 Utilization + - Percent of total cycles counted by the CPF-[L2](L2) interface where the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles over total cycles counted by the CPF-L2. + - Percent +* - CPF-L2 Stall + - Percent of CPF-L2 busy cycles where the CPF-[L2](L2) interface was stalled for any reason. + - Percent +* - CPF-UTCL1 Stall + - Percent of CPF busy cycles where the CPF was stalled by address translation. + - Percent +``` + +### Command Processor Packet Processor (CPC) Metrics + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - CPC Utilization + - Percent of total cycles where the CPC was busy actively doing any work. The ratio of CPC busy cycles over total cycles counted by the CPC. + - Percent +* - CPC Stall + - Percent of CPC busy cycles where the CPC was stalled for any reason. + - Percent +* - CPC Packet Decoding Utilization + - Percent of CPC busy cycles spent decoding commands for processing. + - Percent +* - CPC-Workgroup Manager Utilization + - Percent of CPC busy cycles spent dispatching workgroups to the [Workgroup Manager](SPI). + - Percent +* - CPC-L2 Utilization + - Percent of total cycles counted by the CPC-[L2](L2) interface where the CPC-L2 interface was active doing any work. + - Percent +* - CPC-UTCL1 Stall + - Percent of CPC busy cycles where the CPC was stalled by address translation. + - Percent +* - CPC-UTCL2 Utilization + - Percent of total cycles counted by the CPC's L2 address translation interface where the CPC was busy doing address translation work. + - Percent +``` + +## System Speed-of-Light + +```{warning} +The theoretical maximum throughput for some metrics in this section is currently computed with the maximum achievable clock frequency, as reported by `rocminfo`, for an accelerator. This may not be realistic for all workloads. + +In addition, not all metrics (e.g., FLOP counters) are available on all AMD Instinct(tm) MI accelerators. +For more detail on how operations are counted, see the [FLOP counting convention](FLOP_count) section. +``` + +Finally, the system speed-of-light summarizes some of the key metrics from various sections of Omniperf's profiling report. + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Metric + - Description + - Unit +* - [VALU](valu) FLOPs + - The total floating-point operations executed per second on the [VALU](valu). This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from [MFMA](mfma) instructions. + - GFLOPs +* - [VALU](valu) IOPs + - The total integer operations executed per second on the [VALU](valu). This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator.
Note: this does not include any integer operations from [MFMA](mfma) instructions. + - GIOPs +* - [MFMA](mfma) FLOPs (BF16) + - The total number of 16-bit brain floating point [MFMA](mfma) operations executed per second. Note: this does not include any 16-bit brain floating point operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the specific accelerator. + - GFLOPs +* - [MFMA](mfma) FLOPs (F16) + - The total number of 16-bit floating point [MFMA](mfma) operations executed per second. Note: this does not include any 16-bit floating point operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific accelerator. + - GFLOPs +* - [MFMA](mfma) FLOPs (F32) + - The total number of 32-bit floating point [MFMA](mfma) operations executed per second. Note: this does not include any 32-bit floating point operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific accelerator. + - GFLOPs +* - [MFMA](mfma) FLOPs (F64) + - The total number of 64-bit floating point [MFMA](mfma) operations executed per second. Note: this does not include any 64-bit floating point operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific accelerator. + - GFLOPs +* - [MFMA](mfma) IOPs (INT8) + - The total number of 8-bit integer [MFMA](mfma) operations executed per second. Note: this does not include any 8-bit integer operations from [VALU](valu) instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations achievable on the specific accelerator. + - GIOPs +* - [SALU](salu) Utilization + - Indicates what percent of the kernel's duration the [SALU](salu) was busy executing instructions. Computed as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [SALU](salu) / [SMEM](salu) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - [VALU](valu) Utilization + - Indicates what percent of the kernel's duration the [VALU](valu) was busy executing instructions. Does not include [VMEM](valu) operations. Computed as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [VALU](valu) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - [MFMA](mfma) Utilization + - Indicates what percent of the kernel's duration the [MFMA](mfma) unit was busy executing instructions. Computed as the ratio of the total number of cycles the [MFMA](mfma) was busy over the [total CU cycles](TotalCUCycles). + - Percent +* - [VMEM](valu) Utilization + - Indicates what percent of the kernel's duration the [VMEM](valu) unit was busy executing instructions, including both global/generic and spill/scratch operations (see the [VMEM instruction count metrics](TA_inst) for more detail). Does not include [VALU](valu) operations. Computed as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [VMEM](valu) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - [Branch](branch) Utilization + - Indicates what percent of the kernel's duration the [Branch](branch) unit was busy executing instructions. 
Computed as the ratio of the total number of cycles spent by the [scheduler](scheduler) issuing [Branch](branch) instructions over the [total CU cycles](TotalCUCycles). + - Percent +* - [VALU](valu) Active Threads + - Indicates the average level of [divergence](Divergence) within a wavefront over the lifetime of the kernel. The number of work-items that were active in a wavefront during execution of each [VALU](valu) instruction, time-averaged over all VALU instructions run on all wavefronts in the kernel. + - Work-items +* - IPC + - The ratio of the total number of instructions executed on the [CU](cu) over the [total active CU cycles](TotalActiveCUCycles). This is also presented as a percent of the peak theoretical IPC achievable on the specific accelerator. + - Instructions per-cycle +* - Wavefront Occupancy + - The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (<< 1ms). This is also presented as a percent of the peak theoretical occupancy achievable on the specific accelerator. + - Wavefronts +* - [LDS](lds) Theoretical Bandwidth + - Indicates the maximum amount of bytes that could have been loaded from/stored to/atomically updated in the LDS per unit time (see [LDS Bandwidth](lds_bandwidth) example for more detail). This is also presented as a percent of the peak theoretical LDS bandwidth achievable on the specific accelerator. + - GB/s +* - [LDS](lds) Bank Conflicts/Access + - The ratio of the number of cycles spent in the [LDS scheduler](lds) due to bank conflicts (as determined by the conflict resolution hardware) to the base number of cycles that would be spent in the LDS scheduler in a completely uncontended case. This is also presented in normalized form (i.e., the Bank Conflict Rate). + - Conflicts/Access +* - [vL1D](vL1D) Cache Hit Rate + - The ratio of the number of vL1D cache line requests that hit in vL1D cache over the total number of cache line requests to the [vL1D Cache RAM](TC). + - Percent +* - [vL1D](vL1D) Cache Bandwidth + - The number of bytes looked up in the vL1D cache as a result of [VMEM](VALU) instructions per unit time. The number of bytes is calculated as the number of cache lines requested multiplied by the cache line size. This value does not consider partial requests, so e.g., if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + - GB/s +* - [L2](L2) Cache Hit Rate + - The ratio of the number of L2 cache line requests that hit in the L2 cache over the total number of incoming cache line requests to the L2 cache. + - Percent +* - [L2](L2) Cache Bandwidth + - The number of bytes looked up in the L2 cache per unit time. The number of bytes is calculated as the number of cache lines requested multiplied by the cache line size. This value does not consider partial requests, so e.g., if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + - GB/s +* - [L2](L2)-Fabric Read BW + - The number of bytes read by the L2 over the [Infinity Fabric(tm) interface](l2fabric) per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+ - GB/s +* - [L2](L2)-Fabric Write and Atomic BW + - The number of bytes sent by the L2 over the [Infinity Fabric(tm) interface](l2fabric) by write and atomic operations per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + - GB/s +* - [L2](L2)-Fabric Read Latency + - The time-averaged number of cycles read requests spent in Infinity Fabric(tm) before data was returned to the L2. + - Cycles +* - [L2](L2)-Fabric Write Latency + - The time-averaged number of cycles write requests spent in Infinity Fabric(tm) before a completion acknowledgement was returned to the L2. + - Cycles +* - [sL1D](sL1D) Cache Hit Rate + - The percent of sL1D requests that hit on a previously loaded line in the cache. Calculated as the ratio of the number of sL1D requests that hit over the number of all sL1D requests. + - Percent +* - [sL1D](sL1D) Bandwidth + - The number of bytes looked up in the sL1D cache per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + - GB/s +* - [L1I](L1I) Bandwidth + - The number of bytes looked up in the L1I cache per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + - GB/s +* - [L1I](L1I) Cache Hit Rate + - The percent of L1I requests that hit on a previously loaded line in the cache. Calculated as the ratio of the number of L1I requests that hit over the number of all L1I requests. + - Percent +* - [L1I](L1I) Fetch Latency + - The average number of cycles spent to fetch instructions to a [CU](cu). + - Cycles +``` + +## References + +- [AMD GPU HIP Training](https://www.olcf.ornl.gov/wp-content/uploads/2019/09/AMD_GPU_HIP_training_20190906.pdf) +- [CDNA2 ISA Documentation](https://developer.amd.com/wp-content/resources/CDNA2_Shader_ISA_4February2022.pdf) +- [HSA Runtime Programmer’s Reference Manual](http://hsafoundation.com/wp-content/uploads/2021/02/HSA-Runtime-1.2.pdf) +- [GS-4106 The AMD GCN Architecture - A Crash Course, by Layla Mah](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah) +- [AMD RADEON™ HD 7970 WITH GRAPHICS CORE NEXT (GCN) ARCHITECTURE, by Mike Mantor](https://old.hotchips.org/wp-content/uploads/hc_archives/hc24/HC24-3-ManyCore/HC24.28.315-AMD.GCN.mantor_v1.pdf) +- [AMD’s Radeon Next Generation GPU Architecture “Vega10”, by Mike Mantor](https://old.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.120-Radeon-Vega10-Mantor-AMD-f1.pdf) +- [CDNA2 Whitepaper](https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf) +- [LLVM's User Guide for AMDGPU Backend](https://llvm.org/docs/AMDGPUUsage.html) + +## Disclaimer + +PCIe(r) is a registered trademark of PCI-SIG Corporation.
+ + +# Definitions + +## Miscellaneous + +(TotalActiveCUCycles)= +(TotalCUCycles)= +(TotalSL1DCycles)= +(TotalL1ICycles)= +(TotalL2Cycles)= +(TotalActiveL2Cycles)= +(TotalPipeCycles)= +(TotalSECycles)= +(TotalSIMDCycles)= +(ThreadRequests)= +(Wavefront)= +(Workitem)= +(Workgroup)= +(Divergence)= +(KernelCycles)= +(KernelTime)= + +```{list-table} +:header-rows: 1 +:widths: 20 65 15 +:class: noscroll-table +* - Name + - Description + - Unit +* - Kernel Time + - The number of seconds the accelerator was executing a kernel, from the [Command Processor](CP)'s start-of-kernel timestamp (which is a number of cycles after the CP begins processing the packet) to the CP's end-of-kernel timestamp (which is a number of cycles before the CP stops processing the packet. + - Seconds +* - Kernel Cycles + - The number of cycles the accelerator was active doing _any_ work, as measured by the [Command Processor](CP). + - Cycles +* - Total CU Cycles + - The number of cycles the accelerator was active doing _any_ work (i.e., Kernel Cycles), multiplied by the number of [compute units](CU) on the accelerator. A measure of the total possible active cycles the compute units could be doing work, useful for normalization of metrics inside the CU. + - Cycles +* - Total Active CU Cycles + - The number of cycles a CU on the accelerator was active doing _any_ work, summed over all [compute units](CU) on the accelerator. + - Cycles +* - Total SIMD Cycles + - The number of cycles the accelerator was active doing _any_ work (i.e., Kernel Cycles), multiplied by the number of [SIMDs](CU) on the accelerator. A measure of the total possible active cycles the SIMDs could be doing work, useful for normalization of metrics inside the CU. + - Cycles +* - Total L2 Cycles + - The number of cycles the accelerator was active doing _any_ work (i.e., Kernel Cycles), multiplied by the number of [L2](L2) channels on the accelerator. A measure of the total possible active cycles the L2 channels could be doing work, useful for normalization of metrics inside the L2. + - Cycles +* - Total Active L2 Cycles + - The number of cycles a channel of the L2 cache was active doing _any_ work, summed over all [L2](L2) channels on the accelerator. + - Cycles +* - Total sL1D Cycles + - The number of cycles the accelerator was active doing _any_ work (i.e., Kernel Cycles), multiplied by the number of [scalar L1 Data caches](sL1D) on the accelerator. A measure of the total possible active cycles the sL1Ds could be doing work, useful for normalization of metrics inside the sL1D. + - Cycles +* - Total L1I Cycles + - The number of cycles the accelerator was active doing _any_ work (i.e., Kernel Cycles), multiplied by the number of [L1 Instruction caches](L1I) on the accelerator. A measure of the total possible active cycles the L1Is could be doing work, useful for normalization of metrics inside the L1I. + - Cycles +* - Total Scheduler-Pipe Cycles + - The number of cycles the accelerator was active doing _any_ work (i.e., Kernel Cycles), multiplied by the number of [scheduler pipes](CP) on the accelerator. A measure of the total possible active cycles the scheduler-pipes could be doing work, useful for normalization of metrics inside the [workgroup manager](SPI) and [command processor](CP). + - Cycles +* - Total Shader-Engine Cycles + - The total number of cycles the accelerator was active doing _any_ work, multiplied by the number of [Shader Engines](SE) on the accelerator. 
A measure of the total possible active cycles the Shader Engines could be doing work, useful for normalization of metrics inside the [workgroup manager](SPI). + - Cycles +* - Thread-requests + - The number of unique memory addresses accessed by a single memory instruction. On AMD's Instinct(tm) accelerators, this is a maximum of 64 (i.e., the size of the wavefront). + - Addresses +* - Work-item + - A single 'thread' (lane) of execution that executes in lockstep with the rest of the work-items comprising a [wavefront](Wavefront) of execution. + - N/A +* - Wavefront + - A group of work-items, or threads, that execute in lockstep on the [compute-unit](CU). On AMD's Instinct(tm) accelerators, the wavefront size is always 64 work-items. + - N/A +* - Workgroup + - A group of wavefronts that execute on the same [compute-unit](CU), and can cooperatively execute and share data via the use of synchronization primitives, [LDS](lds), atomics, etc. + - N/A +* - Divergence + - Divergence within a wavefront occurs when not all work-items are active when executing an instruction, e.g., due to non-uniform control flow within a wavefront. Can reduce overall execution efficiency by causing, e.g., the [VALU](valu) to have to execute both branches of a conditional with different sets of work-items active. + - N/A +``` + +(normunit)= +## Normalization units + +A user-configurable unit by which the user can choose to normalize data. Choices include: + +```{list-table} +:header-rows: 1 +:widths: 20 80 +:class: noscroll-table +* - Name + - Description +* - `per_cycle` + - The total value of the measured counter/metric that occurred per kernel invocation divided by the [Kernel Cycles](KernelCycles), i.e., total number of cycles the kernel executed as measured by the [Command Processor](CP). +* - `per_wave` + - The total value of the measured counter/metric that occurred per kernel invocation divided by the total number of [wavefronts](wavefront) launched in the kernel. +* - `per_kernel` + - The total value of the measured counter/metric that occurred per kernel invocation. +* - `per_second` + - The total value of the measured counter/metric that occurred per kernel invocation divided by the [Kernel Time](KernelTime), i.e., the total runtime of the kernel in seconds, as measured by the [Command Processor](CP). +``` + +By default, Omniperf uses the `per_wave` normalization. The appropriate normalization will vary depending on your use case. +For instance, a `per_second` normalization may be useful for FLOP or bandwidth comparisons, while a `per_wave` normalization may be useful, e.g., to see how many (and what types of) instructions are used per wavefront, and a `per_kernel` normalization may be useful to get the total aggregate values of metrics for comparison between different configurations.
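+
+When analyzing a profile from the command line, the normalization unit can be selected with the `-n` option (shown here with a placeholder workload path; the same option appears in the profiling examples later in this document), e.g.:
+
+```shell-session
+$ omniperf analyze -p workloads/<workload_name>/mi200 -n per_kernel
+```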
+ +(Mspace)= +## Memory Spaces + +AMD Instinct(tm) MI accelerators can access memory through multiple address spaces which may map to different physical memory locations on the system. +The [table below](mspace-table) provides a view of how various types of memory used in HIP map onto these constructs: + +```{list-table} Memory / Address space terminology +:header-rows: 1 +:name: mspace-table +:class: noscroll-table + +* - LLVM Address Space + - Hardware Memory Space + - HIP Terminology +* - Generic + - Flat + - N/A +* - Global + - Global + - Global +* - Local + - LDS + - LDS/Shared +* - Private + - Scratch + - Private +* - Constant + - Same as global + - Constant +``` + +Below is a high-level description of the address spaces in the AMDGPU backend of LLVM: + +```{list-table} +:header-rows: 1 +:widths: 20 80 +:class: noscroll-table + +* - Address space + - Description +* - Global + - Memory that can be seen by all threads in a process, and may be backed by the local accelerator's HBM, a remote accelerator's HBM, or the CPU's DRAM. +* - Local + - Memory that is only visible to a particular workgroup. On AMD's Instinct(tm) accelerator hardware, this is stored in [LDS](LDS) memory. +* - Private + - Memory that is only visible to a particular [work-item](workitem) (thread), stored in the scratch space on AMD's Instinct(tm) accelerators. +* - Constant + - Read-only memory that is in the global address space and stored on the local accelerator's HBM. +* - Generic + - Used when the compiler cannot statically prove that a pointer is addressing memory in a single (non-generic) address space. Mapped to Flat on AMD's Instinct(tm) accelerators, the pointer could dynamically address global, local, private or constant memory. +``` + +[LLVM's documentation for AMDGPU Backend](https://llvm.org/docs/AMDGPUUsage.html#address-spaces) will always have the most up-to-date information, and the interested reader is referred to this source for a more complete explanation. + +(Mtype)= +## Memory Type + +AMD Instinct(tm) accelerators contain a number of different memory allocation types to enable the HIP language's [memory coherency model](https://rocm.docs.amd.com/projects/HIP/en/latest/user_guide/programming_manual.html#coherency-controls). +These memory types are broadly similar between AMD Instinct(tm) accelerator generations, but may differ in exact implementation. + +In addition, these memory types _may_ differ between accelerators on the same system, even when accessing the same memory allocation. +For example, an [MI2XX](2xxnote) accelerator accessing "fine-grained" memory allocated local to that device may see the allocation as coherently cachable, while a remote accelerator might see the same allocation as uncached. + +These memory types include: + +```{list-table} +:header-rows: 1 +:widths: 20 80 +:class: noscroll-table + * - Memory type + - Description + * - Uncached Memory (UC) + - Memory that will not be cached in this accelerator. On [MI2XX](2xxnote) accelerators, this corresponds to "fine-grained" (a.k.a., "coherent") memory allocated on a remote accelerator or the host, e.g., using `hipHostMalloc` or `hipMallocManaged` with default allocation flags. + * - Non-hardware-Coherent Memory (NC) + - Memory that will be cached by the accelerator, and is only guaranteed to be consistent at kernel boundaries / after software-driven synchronization events. On [MI2XX](2xxnote) accelerators, this type of memory maps to (e.g.,) "coarse-grained" `hipHostMalloc`'d memory (i.e., allocated with the `hipHostMallocNonCoherent` flag), or `hipMalloc`'d memory allocated on a remote accelerator. + * - Coherently Cachable (CC) + - Memory for which only reads from the accelerator where the memory was allocated will be cached. Writes to CC memory are uncached, and trigger invalidations of any line within this accelerator. On [MI2XX](2xxnote) accelerators, this type of memory maps to "fine-grained" memory allocated on the local accelerator using, e.g., the `hipExtMallocWithFlags` API with the `hipDeviceMallocFinegrained` flag. + * - Read/Write Coherent Memory (RW) + - Memory that will be cached by the accelerator, but may be invalidated by writes from remote devices at kernel boundaries / after software-driven synchronization events. On [MI2XX](2xxnote) accelerators, this corresponds to "coarse-grained" memory allocated locally to the accelerator, using e.g., the default `hipMalloc` allocator. +```
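+
+To make the classifications above concrete, the sketch below shows how each type might be requested from HIP on an [MI2XX](2xxnote) accelerator, using the allocators and flags named in the table. This is an illustrative fragment rather than one of the Omniperf samples, and error checking is omitted for brevity:
+
+```c++
+#include <hip/hip_runtime.h>
+
+void allocate_each_memory_type(size_t bytes) {
+  void *uc, *nc, *cc, *rw;
+  // Uncached (UC): fine-grained host memory, default hipHostMalloc flags.
+  (void)hipHostMalloc(&uc, bytes, hipHostMallocDefault);
+  // Non-hardware-Coherent (NC): coarse-grained host memory.
+  (void)hipHostMalloc(&nc, bytes, hipHostMallocNonCoherent);
+  // Coherently Cachable (CC): fine-grained memory on the local accelerator.
+  (void)hipExtMallocWithFlags(&cc, bytes, hipDeviceMallocFinegrained);
+  // Read/Write Coherent (RW): coarse-grained memory on the local accelerator.
+  (void)hipMalloc(&rw, bytes);
+}
+```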
+ +A good discussion of coarse- and fine-grained memory allocations and what type of memory is returned by various combinations of memory allocators, flags and arguments can be found in the [Crusher Quick-Start Guide](https://docs.olcf.ornl.gov/systems/crusher_quick_start_guide.html#floating-point-fp-atomic-operations-and-coarse-fine-grained-memory-allocations). + +(profiling-with-omniperf)= +# Profiling with Omniperf by Example + +(VALU_inst_mix_example)= +## VALU Arithmetic Instruction Mix + +For this example, we consider the [instruction mix sample](https://github.com/AMDResearch/omniperf/blob/dev/sample/instmix.hip) distributed as a part of Omniperf. + +```{note} +This example is expected to work on all CDNA accelerators; however, the results in this section were collected on an [MI2XX](2xxnote) accelerator. +``` + +### Design note + +This code uses a number of inline assembly instructions to cleanly identify the types of instructions being issued, as well as to avoid optimization / dead-code elimination by the compiler. +While inline assembly is inherently unportable, this example is expected to work on all GCN GPUs and CDNA accelerators. + +We reproduce a sample of the kernel below: + +```c++ + // fp32: add, mul, transcendental and fma + float f1, f2; + asm volatile( + "v_add_f32_e32 %0, %1, %0\n" + "v_mul_f32_e32 %0, %1, %0\n" + "v_sqrt_f32 %0, %1\n" + "v_fma_f32 %0, %1, %0, %1\n" + : "=v"(f1) + : "v"(f2)); +``` + +These instructions correspond to: + - A 32-bit floating point addition, + - A 32-bit floating point multiplication, + - A 32-bit floating point square-root transcendental operation, and + - A 32-bit floating point fused multiply-add operation. + +For more detail, the reader is referred to (e.g.,) the [CDNA2 ISA Guide](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf). + +### Instruction mix + +This example was compiled and run on an MI250 accelerator using ROCm v5.6.0 and Omniperf v2.0.0. +```shell-session +$ hipcc -O3 instmix.hip -o instmix +``` + +We generate our profile for this example via: +```shell-session +$ omniperf profile -n instmix --no-roof -- ./instmix +``` + +and finally, analyze the instruction mix section: +```shell-session +$ omniperf analyze -p workloads/instmix/mi200/ -b 10.2 +<...> +10.
Compute Units - Instruction Mix +10.2 VALU Arithmetic Instr Mix +╒═════════╤════════════╤═════════╤════════════════╕ +│ Index │ Metric │ Count │ Unit │ +╞═════════╪════════════╪═════════╪════════════════╡ +│ 10.2.0 │ INT32 │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.1 │ INT64 │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.2 │ F16-ADD │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.3 │ F16-MUL │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.4 │ F16-FMA │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.5 │ F16-Trans │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.6 │ F32-ADD │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.7 │ F32-MUL │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.8 │ F32-FMA │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.9 │ F32-Trans │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.10 │ F64-ADD │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.11 │ F64-MUL │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.12 │ F64-FMA │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.13 │ F64-Trans │ 1.00 │ Instr per wave │ +├─────────┼────────────┼─────────┼────────────────┤ +│ 10.2.14 │ Conversion │ 1.00 │ Instr per wave │ +╘═════════╧════════════╧═════════╧════════════════╛ +``` + +shows that we have exactly one of each type of VALU arithmetic instruction, by construction! + +(Fabric_transactions_example)= +## Infinity-Fabric(tm) transactions + +For this example, we consider the [Infinity Fabric(tm) sample](https://github.com/AMDResearch/omniperf/blob/dev/sample/fabric.hip) distributed as a part of Omniperf. +This code launches a simple read-only kernel, e.g.: + +```c++ +// the main streaming kernel +__global__ void kernel(int* x, size_t N, int zero) { + int sum = 0; + const size_t offset_start = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = 0; i < 10; ++i) { + for (size_t offset = offset_start; offset < N; offset += blockDim.x * gridDim.x) { + sum += x[offset]; + } + } + if (sum != 0) { + x[offset_start] = sum; + } +} +``` + +twice; once as a warmup, and once for analysis. +We note that the buffer `x` is initialized to all zeros via a call to `hipMemcpy` on the host before the kernel is ever launched, therefore the conditional: + +```c++ +if (sum != 0) { ... +``` + +is identically false (and thus: we expect no writes). + +```{note} +The actual sample included with Omniperf also includes the ability to select different operation types, e.g., atomics, writes, etc. +This abbreviated version is presented here for reference only. +``` + +Finally, this sample code lets the user control: + - The [granularity of an allocation](Mtype), + - The owner of an allocation (local HBM, CPU DRAM or remote HBM), and + - The size of an allocation (the default is $\sim4$GiB) + +via command line arguments. +In doing so, we can explore the impact of these parameters on the L2-Fabric metrics reported by Omniperf to further understand their meaning. + +All results in this section were generated an a node of Infinity Fabric(tm) connected MI250 accelerators using ROCm v5.6.0, and Omniperf v2.0.0. 
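+
+To make that setup concrete, below is a minimal sketch of the host-side driver implied by the description above. This is not the sample's actual code: the helper name, buffer size, and launch geometry are illustrative assumptions, but it shows the zero-initialization via `hipMemcpy` and the warmup-plus-measured launch pattern used throughout these experiments.
+
+```c++
+// Hedged sketch of the host-side setup (illustrative only, not fabric.hip's
+// actual driver): zero-fill the buffer so `sum != 0` is always false, then
+// launch the kernel once as a warmup and once for the analyzed dispatch.
+#include <hip/hip_runtime.h>
+#include <vector>
+
+__global__ void kernel(int* x, size_t N, int zero);  // the streaming kernel shown above
+
+void run_read_experiment(int* x_dev, size_t N) {
+  std::vector<int> zeros(N, 0);
+  // the buffer is all zeros before the first launch, so the kernel never writes
+  (void)hipMemcpy(x_dev, zeros.data(), N * sizeof(int), hipMemcpyHostToDevice);
+  kernel<<<4096, 256>>>(x_dev, N, 0);  // warmup launch
+  kernel<<<4096, 256>>>(x_dev, N, 0);  // measured launch
+  (void)hipDeviceSynchronize();
+}
+```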
+Although results may vary with ROCm versions and accelerator connectivity, we expect the lessons learned here to be broadly applicable. + +(Fabric_exp_1)= +### Experiment #1 - Coarse-grained, accelerator-local HBM reads + +In our first experiment, we consider the simplest possible case, a `hipMalloc`'d buffer that is local to our current accelerator: + +```shell-session +$ omniperf profile -n coarse_grained_local --no-roof -- ./fabric -t 1 -o 0 +Using: + mtype:CoarseGrained + mowner:Device + mspace:Global + mop:Read + mdata:Unsigned + remoteId:-1 +<...> +$ omniperf analyze -p workloads/coarse_grained_local/mi200 -b 17.2.0 17.2.1 17.2.2 17.4.0 17.4.1 17.4.2 17.5.0 17.5.1 17.5.2 17.5.3 17.5.4 -n per_kernel --dispatch 2 +<...> +17. L2 Cache +17.2 L2 - Fabric Transactions +╒═════════╤═════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ +│ 17.2.0 │ L2-Fabric Read BW │ 42947428672.00 │ 42947428672.00 │ 42947428672.00 │ Bytes per kernel │ +├─────────┼─────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.1 │ HBM Read Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +├─────────┼─────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.2 │ Remote Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧═════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ +17.4 L2 - Fabric Interface Stalls +╒═════════╤═══════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ +│ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ +│ 17.4.0 │ Read - PCIe Stall │ PCIe Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.1 │ Read - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.2 │ Read - HBM Stall │ HBM Stall │ Read │ 0.07 │ 0.07 │ 0.07 │ Pct │ +╘═════════╧═══════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ +17.5 L2 - Fabric Detailed Transaction Breakdown +╒═════════╤═════════════════╤══════════════╤══════════════╤══════════════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════╪══════════════╪══════════════╪══════════════╪════════════════╡ +│ 17.5.0 │ Read (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.1 │ Read (Uncached) │ 1450.00 │ 1450.00 │ 1450.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.2 │ Read (64B) │ 671053573.00 │ 671053573.00 │ 671053573.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.3 │ HBM Read │ 671053565.00 │ 671053565.00 │ 671053565.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.4 │ Remote Read │ 8.00 │ 8.00 │ 8.00 │ Req per 
kernel │
+╘═════════╧═════════════════╧══════════════╧══════════════╧══════════════╧════════════════╛
+```
+
+Here, we see:
+ - The vast majority of L2-Fabric requests (>99%) are 64B read requests (17.5.2)
+ - Nearly 100% of the read requests (17.2.1) are homed in on the accelerator-local HBM (17.5.3), while some small fraction of these reads are routed to a "remote" device (17.5.4)
+ - These drive $\sim40$GiB of read bandwidth per kernel (17.2.0)
+
+In addition, we see a small amount of [uncached](Mtype) reads (17.5.1); these correspond to things like:
+ - the assembly code to execute the kernel
+ - kernel arguments
+ - coordinate parameters (e.g., blockDim.z) that were not initialized by the hardware, etc.
+and may account for some of our 'remote' read requests (17.5.4), e.g., reading from CPU DRAM.
+
+The above list is not exhaustive, nor are all of these guaranteed to be 'uncached' -- the exact implementation depends on the accelerator and ROCm versions used.
+These read requests could be interrogated further in the [Scalar L1 Data Cache](sL1D) and [Instruction Cache](L1I) metric sections.
+
+```{note}
+The Traffic metrics in Sec 17.2 are presented as a percentage of the total number of requests, e.g., 'HBM Read Traffic' is the percent of read requests (17.5.0-17.5.2) that were directed to the accelerator's local HBM (17.5.3).
+```
+
+(Fabric_exp_2)=
+### Experiment #2 - Fine-grained, accelerator-local HBM reads
+
+In this experiment, we change the [granularity](Mtype) of our device allocation to be fine-grained device memory, local to the current accelerator.
+Our code uses the `hipExtMallocWithFlags` API with the `hipDeviceMallocFinegrained` flag to accomplish this.
+
+```{note}
+On some systems (e.g., those with only PCIe(r) connected accelerators), you need to set the environment variable `HSA_FORCE_FINE_GRAIN_PCIE=1` to enable this memory type.
+```
+
+```shell-session
+$ omniperf profile -n fine_grained_local --no-roof -- ./fabric -t 0 -o 0
+Using:
+ mtype:FineGrained
+ mowner:Device
+ mspace:Global
+ mop:Read
+ mdata:Unsigned
+ remoteId:-1
+<...>
+$ omniperf analyze -p workloads/fine_grained_local/mi200 -b 17.2.0 17.2.1 17.2.2 17.2.3 17.4.0 17.4.1 17.4.2 17.5.0 17.5.1 17.5.2 17.5.3 17.5.4 -n per_kernel --dispatch 2
+<...>
+17. 
L2 Cache +17.2 L2 - Fabric Transactions +╒═════════╤═══════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ +│ 17.2.0 │ L2-Fabric Read BW │ 42948661824.00 │ 42948661824.00 │ 42948661824.00 │ Bytes per kernel │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.1 │ HBM Read Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.2 │ Remote Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.3 │ Uncached Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧═══════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ +17.4 L2 - Fabric Interface Stalls +╒═════════╤═══════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ +│ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ +│ 17.4.0 │ Read - PCIe Stall │ PCIe Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.1 │ Read - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.2 │ Read - HBM Stall │ HBM Stall │ Read │ 0.07 │ 0.07 │ 0.07 │ Pct │ +╘═════════╧═══════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ +17.5 L2 - Fabric Detailed Transaction Breakdown +╒═════════╤═════════════════╤══════════════╤══════════════╤══════════════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════╪══════════════╪══════════════╪══════════════╪════════════════╡ +│ 17.5.0 │ Read (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.1 │ Read (Uncached) │ 1334.00 │ 1334.00 │ 1334.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.2 │ Read (64B) │ 671072841.00 │ 671072841.00 │ 671072841.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.3 │ HBM Read │ 671072835.00 │ 671072835.00 │ 671072835.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.4 │ Remote Read │ 6.00 │ 6.00 │ 6.00 │ Req per kernel │ +╘═════════╧═════════════════╧══════════════╧══════════════╧══════════════╧════════════════╛ +``` + +Comparing with our [previous example](Fabric_exp_1), we see a relatively similar result, namely: + - The vast majority of L2-Fabric requests are 64B read requests (17.5.2) + - Nearly all these read requests are directed to the accelerator-local HBM (17.2.1) + +In addition, we now see a small percentage of HBM Read Stalls (17.4.2), as streaming fine-grained memory is putting more stress on Infinity Fabric(tm). 
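+
+The fabric experiments differ mainly in how (and where) the buffer is allocated. As a reference, here is a hedged sketch of the allocation paths they exercise; the helper and its arguments are illustrative assumptions, but the HIP calls and flags are the ones named in the text (error checking omitted for brevity):
+
+```c++
+// Illustrative sketch of the allocation variants used by Experiments #1-#5
+// (not the fabric sample's exact code, which selects these via its -t/-o flags).
+#include <hip/hip_runtime.h>
+
+int* allocate_buffer(size_t bytes, bool fine_grained, int owner /*0=local, 1=host, 2=remote*/) {
+  void* p = nullptr;
+  if (owner == 1) {
+    // CPU DRAM: fine-grained by default, coarse-grained with hipHostMallocNonCoherent
+    (void)hipHostMalloc(&p, bytes, fine_grained ? hipHostMallocDefault : hipHostMallocNonCoherent);
+  } else {
+    if (owner == 2) (void)hipSetDevice(1);  // allocate on a remote accelerator (Experiment #3)
+    if (fine_grained)
+      (void)hipExtMallocWithFlags(&p, bytes, hipDeviceMallocFinegrained);  // fine-grained HBM
+    else
+      (void)hipMalloc(&p, bytes);  // coarse-grained HBM (Experiment #1)
+    if (owner == 2) (void)hipSetDevice(0);  // reset to the default device before launching
+  }
+  return static_cast<int*>(p);
+}
+```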
+ +```{note} +The stalls in Sec 17.4 are presented as a percentage of the total number active L2 cycles, summed over [all L2 channels](L2). +``` + +(Fabric_exp_3)= +### Experiment #3 - Fine-grained, remote-accelerator HBM reads + +In this experiment, we move our [fine-grained](Mtype) allocation to be owned by a remote accelerator. +We accomplish this by first changing the HIP device using e.g., `hipSetDevice(1)` API, then allocating fine-grained memory (as described [previously](Fabric_exp_2)), and finally resetting the device back to the default, e.g., `hipSetDevice(0)`. + +Although we have not changed our code significantly, we do see a substantial change in the L2-Fabric metrics: + +```shell-session +$ omniperf profile -n fine_grained_remote --no-roof -- ./fabric -t 0 -o 2 +Using: + mtype:FineGrained + mowner:Remote + mspace:Global + mop:Read + mdata:Unsigned + remoteId:-1 +<...> +$ omniperf analyze -p workloads/fine_grained_remote/mi200 -b 17.2.0 17.2.1 17.2.2 17.2.3 17.4.0 17.4.1 17.4.2 17.5.0 17.5.1 17.5.2 17.5.3 17.5.4 -n per_kernel --dispatch 2 +<...> +17. L2 Cache +17.2 L2 - Fabric Transactions +╒═════════╤═══════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ +│ 17.2.0 │ L2-Fabric Read BW │ 42949692736.00 │ 42949692736.00 │ 42949692736.00 │ Bytes per kernel │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.1 │ HBM Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.2 │ Remote Read Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.3 │ Uncached Read Traffic │ 200.00 │ 200.00 │ 200.00 │ Pct │ +╘═════════╧═══════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ +17.4 L2 - Fabric Interface Stalls +╒═════════╤═══════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ +│ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ +│ 17.4.0 │ Read - PCIe Stall │ PCIe Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.1 │ Read - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Read │ 17.85 │ 17.85 │ 17.85 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.2 │ Read - HBM Stall │ HBM Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧═══════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ +17.5 L2 - Fabric Detailed Transaction Breakdown +╒═════════╤═════════════════╤═══════════════╤═══════════════╤═══════════════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════╪═══════════════╪═══════════════╪═══════════════╪════════════════╡ +│ 17.5.0 │ Read (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ +├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ +│ 17.5.1 │ Read (Uncached) │ 1342177894.00 │ 
1342177894.00 │ 1342177894.00 │ Req per kernel │
+├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤
+│ 17.5.2  │ Read (64B)      │ 671088949.00  │ 671088949.00  │ 671088949.00  │ Req per kernel │
+├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤
+│ 17.5.3  │ HBM Read        │ 307.00        │ 307.00        │ 307.00        │ Req per kernel │
+├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤
+│ 17.5.4  │ Remote Read     │ 671088642.00  │ 671088642.00  │ 671088642.00  │ Req per kernel │
+╘═════════╧═════════════════╧═══════════════╧═══════════════╧═══════════════╧════════════════╛
+```
+
+First, we see that while we still observe approximately the same number of 64B Read Requests (17.5.2), we now see an even larger number of Uncached Read Requests (17.5.1). Some simple division reveals:
+```math
+1342177894.00 / 671088949.00 ≈ 2
+```
+That is, each 64B Read Request is _also_ counted as two Uncached Read Requests, as reflected in the [request-flow diagram](fabric-fig).
+This is also why the Uncached Read Traffic metric (17.2.3) is at the counter-intuitive value of 200%!
+
+In addition, we also observe that:
+ - we no longer see any significant number of HBM Read Requests (17.2.1, 17.5.3), nor HBM Read Stalls (17.4.2), but instead
+ - we observe that almost all of these requests are considered "remote" (17.2.2, 17.5.4), being routed to another accelerator, or the CPU --- in this case HIP Device 1 --- and
+ - we observe a significantly larger percentage of AMD Infinity Fabric(tm) Read Stalls (17.4.1) as compared to the HBM Read Stalls in the [previous example](Fabric_exp_2)
+
+These stalls correspond to reads that are going out over the AMD Infinity Fabric(tm) connection to another MI250 accelerator.
+In addition, because these are crossing between accelerators, we expect significantly lower achievable bandwidths as compared to the local accelerator's HBM -- this is reflected (indirectly) in the magnitude of the stall metric (17.4.1).
+Finally, we note that if our system contained only PCIe(r) connected accelerators, these observations would differ.
+
+(Fabric_exp_4)=
+### Experiment #4 - Fine-grained, CPU-DRAM reads
+
+In this experiment, we move our [fine-grained](Mtype) allocation to be owned by the CPU's DRAM.
+We accomplish this by allocating host-pinned fine-grained memory using the `hipHostMalloc` API:
+
+```shell-session
+$ omniperf profile -n fine_grained_host --no-roof -- ./fabric -t 0 -o 1
+Using:
+ mtype:FineGrained
+ mowner:Host
+ mspace:Global
+ mop:Read
+ mdata:Unsigned
+ remoteId:-1
+<...>
+$ omniperf analyze -p workloads/fine_grained_host/mi200 -b 17.2.0 17.2.1 17.2.2 17.2.3 17.4.0 17.4.1 17.4.2 17.5.0 17.5.1 17.5.2 17.5.3 17.5.4 -n per_kernel --dispatch 2
+<...>
+17. 
L2 Cache +17.2 L2 - Fabric Transactions +╒═════════╤═══════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ +│ 17.2.0 │ L2-Fabric Read BW │ 42949691264.00 │ 42949691264.00 │ 42949691264.00 │ Bytes per kernel │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.1 │ HBM Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.2 │ Remote Read Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.3 │ Uncached Read Traffic │ 200.00 │ 200.00 │ 200.00 │ Pct │ +╘═════════╧═══════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ +17.4 L2 - Fabric Interface Stalls +╒═════════╤═══════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ +│ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ +│ 17.4.0 │ Read - PCIe Stall │ PCIe Stall │ Read │ 91.29 │ 91.29 │ 91.29 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.1 │ Read - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.2 │ Read - HBM Stall │ HBM Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧═══════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ +17.5 L2 - Fabric Detailed Transaction Breakdown +╒═════════╤═════════════════╤═══════════════╤═══════════════╤═══════════════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════╪═══════════════╪═══════════════╪═══════════════╪════════════════╡ +│ 17.5.0 │ Read (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ +├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ +│ 17.5.1 │ Read (Uncached) │ 1342177848.00 │ 1342177848.00 │ 1342177848.00 │ Req per kernel │ +├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ +│ 17.5.2 │ Read (64B) │ 671088926.00 │ 671088926.00 │ 671088926.00 │ Req per kernel │ +├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ +│ 17.5.3 │ HBM Read │ 284.00 │ 284.00 │ 284.00 │ Req per kernel │ +├─────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────────┤ +│ 17.5.4 │ Remote Read │ 671088642.00 │ 671088642.00 │ 671088642.00 │ Req per kernel │ +╘═════════╧═════════════════╧═══════════════╧═══════════════╧═══════════════╧════════════════╛ +``` + +Here we see _almost_ the same results as in the [previous experiment](Fabric_exp_3), however now as we are crossing a PCIe(r) bus to the CPU, we see that the Infinity Fabric(tm) Read stalls (17.4.1) have shifted to be a PCIe(r) stall (17.4.2). 
+In addition, as (on this system) the PCIe(r) bus has a lower peak bandwidth than the AMD Infinity Fabric(TM) connection between two accelerators, we once again observe an increase in the percentage of stalls on this interface. + +```{note} +Had we performed this same experiment on a [MI250X system](https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf), these transactions would again have been marked as Infinity Fabric(tm) Read stalls (17.4.1), as the CPU is connected to the accelerator via AMD Infinity Fabric. +``` + +(Fabric_exp_5)= +### Experiment #5 - Coarse-grained, CPU-DRAM reads + +In our next fabric experiment, we change our CPU memory allocation to be [coarse-grained](Mtype). +We accomplish this by passing the `hipHostMalloc` API the `hipHostMallocNonCoherent` flag, to mark the allocation as coarse-grained: + +```shell-session +$ omniperf profile -n coarse_grained_host --no-roof -- ./fabric -t 1 -o 1 +Using: + mtype:CoarseGrained + mowner:Host + mspace:Global + mop:Read + mdata:Unsigned + remoteId:-1 +<...> +$ omniperf analyze -p workloads/coarse_grained_host/mi200 -b 17.2.0 17.2.1 17.2.2 17.2.3 17.4.0 17.4.1 17.4.2 17.5.0 17.5.1 17.5.2 17.5.3 17.5.4 -n per_kernel --dispatch 2 +<...> +17. L2 Cache +17.2 L2 - Fabric Transactions +╒═════════╤═══════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ +│ 17.2.0 │ L2-Fabric Read BW │ 42949691264.00 │ 42949691264.00 │ 42949691264.00 │ Bytes per kernel │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.1 │ HBM Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.2 │ Remote Read Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +├─────────┼───────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.3 │ Uncached Read Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧═══════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ +17.4 L2 - Fabric Interface Stalls +╒═════════╤═══════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ +│ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ +│ 17.4.0 │ Read - PCIe Stall │ PCIe Stall │ Read │ 91.27 │ 91.27 │ 91.27 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.1 │ Read - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.2 │ Read - HBM Stall │ HBM Stall │ Read │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧═══════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ +17.5 L2 - Fabric Detailed Transaction Breakdown +╒═════════╤═════════════════╤══════════════╤══════════════╤══════════════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════╪══════════════╪══════════════╪══════════════╪════════════════╡ +│ 17.5.0 │ Read (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ 
+├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.1 │ Read (Uncached) │ 562.00 │ 562.00 │ 562.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.2 │ Read (64B) │ 671088926.00 │ 671088926.00 │ 671088926.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.3 │ HBM Read │ 281.00 │ 281.00 │ 281.00 │ Req per kernel │ +├─────────┼─────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.4 │ Remote Read │ 671088645.00 │ 671088645.00 │ 671088645.00 │ Req per kernel │ +╘═════════╧═════════════════╧══════════════╧══════════════╧══════════════╧════════════════╛ +``` + +Here we see a similar result to our [previous experiment](Fabric_exp_4), with one key difference: our accesses are no longer marked as Uncached Read requests (17.2.3, 17.5.1), but instead are 64B read requests (17.5.2), as observed in our [Coarse-grained, accelerator-local HBM](Fabric_exp_1) experiment. + +(Fabric_exp_6)= +### Experiment #6 - Fine-grained, CPU-DRAM writes + +Thus far in our exploration of the L2-Fabric interface, we have primarily focused on read operations. +However, in [our request flow diagram](fabric-fig), we note that writes are counted separately. +To obeserve this, we use the '-p' flag to trigger write operations to fine-grained memory allocated on the host: + +```shell-session +$ omniperf profile -n fine_grained_host_write --no-roof -- ./fabric -t 0 -o 1 -p 1 +Using: + mtype:FineGrained + mowner:Host + mspace:Global + mop:Write + mdata:Unsigned + remoteId:-1 +<...> +$ omniperf analyze -p workloads/fine_grained_host_writes/mi200 -b 17.2.4 17.2.5 17.2.6 17.2.7 17.2.8 17.4.3 17.4.4 17.4.5 17.4.6 17.5.5 17.5.6 17.5.7 17.5.8 17.5.9 17.5.10 -n per_kernel --dispatch 2 +<...> +17. 
L2 Cache +17.2 L2 - Fabric Transactions +╒═════════╤═══════════════════════════════════╤════════════════╤════════════════╤════════════════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════════════════╪════════════════╪════════════════╪════════════════╪══════════════════╡ +│ 17.2.4 │ L2-Fabric Write and Atomic BW │ 42949672960.00 │ 42949672960.00 │ 42949672960.00 │ Bytes per kernel │ +├─────────┼───────────────────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.5 │ HBM Write and Atomic Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.6 │ Remote Write and Atomic Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +├─────────┼───────────────────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.7 │ Atomic Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────────┼────────────────┼────────────────┼────────────────┼──────────────────┤ +│ 17.2.8 │ Uncached Write and Atomic Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +╘═════════╧═══════════════════════════════════╧════════════════╧════════════════╧════════════════╧══════════════════╛ +17.4 L2 - Fabric Interface Stalls +╒═════════╤════════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ +│ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ +│ 17.4.3 │ Write - PCIe Stall │ PCIe Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.4 │ Write - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.5 │ Write - HBM Stall │ HBM Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.6 │ Write - Credit Starvation │ Credit Starvation │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧════════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ +17.5 L2 - Fabric Detailed Transaction Breakdown +╒═════════╤═════════════════════════╤══════════════╤══════════════╤══════════════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════════════╪══════════════╪══════════════╪══════════════╪════════════════╡ +│ 17.5.5 │ Write (32B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ +├─────────┼─────────────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.6 │ Write (Uncached) │ 671088640.00 │ 671088640.00 │ 671088640.00 │ Req per kernel │ +├─────────┼─────────────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.7 │ Write (64B) │ 671088640.00 │ 671088640.00 │ 671088640.00 │ Req per kernel │ +├─────────┼─────────────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.8 │ HBM Write and Atomic │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ +├─────────┼─────────────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤ +│ 17.5.9 │ Remote Write and Atomic │ 
671088640.00 │ 671088640.00 │ 671088640.00 │ Req per kernel │
+├─────────┼─────────────────────────┼──────────────┼──────────────┼──────────────┼────────────────┤
+│ 17.5.10 │ Atomic                  │ 0.00         │ 0.00         │ 0.00         │ Req per kernel │
+╘═════════╧═════════════════════════╧══════════════╧══════════════╧══════════════╧════════════════╛
+```
+
+Here we notice a few changes in our request pattern:
+ - As expected, the requests have changed from 64B Reads to 64B Write requests (17.5.7),
+ - these requests are homed in on a "remote" destination (17.2.6, 17.5.9), as expected, and,
+ - these are also each counted as an Uncached Write request (17.5.6).
+
+In addition, there are rather significant changes in the bandwidth values reported:
+ - the "L2-Fabric Write and Atomic" bandwidth metric (17.2.4) reports about 40GiB of data written across Infinity Fabric(tm), while
+ - the "Remote Write and Atomic Traffic" metric (17.2.6) indicates that nearly 100% of these requests are being directed to a remote source
+
+The precise meaning of these metrics will be explored in the [subsequent experiment](Fabric_exp_7).
+
+Finally, we note that we see no write stalls on the PCIe(r) bus (17.4.3). This is because writes over a PCIe(r) bus [are posted transactions](https://members.pcisig.com/wg/PCI-SIG/document/10912), i.e., they do not require acknowledgement.
+
+(Fabric_exp_7)=
+### Experiment #7 - Fine-grained, CPU-DRAM atomicAdd
+
+Next, we change our experiment to instead target `atomicAdd` operations to the CPU's DRAM.
+
+```shell-session
+$ omniperf profile -n fine_grained_host_add --no-roof -- ./fabric -t 0 -o 1 -p 2
+Using:
+ mtype:FineGrained
+ mowner:Host
+ mspace:Global
+ mop:Add
+ mdata:Unsigned
+ remoteId:-1
+<...>
+$ omniperf analyze -p workloads/fine_grained_host_add/mi200 -b 17.2.4 17.2.5 17.2.6 17.2.7 17.2.8 17.4.3 17.4.4 17.4.5 17.4.6 17.5.5 17.5.6 17.5.7 17.5.8 17.5.9 17.5.10 -n per_kernel --dispatch 2
+<...>
+17. 
L2 Cache +17.2 L2 - Fabric Transactions +╒═════════╤═══════════════════════════════════╤══════════════╤══════════════╤══════════════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════════════════╪══════════════╪══════════════╪══════════════╪══════════════════╡ +│ 17.2.4 │ L2-Fabric Write and Atomic BW │ 429496736.00 │ 429496736.00 │ 429496736.00 │ Bytes per kernel │ +├─────────┼───────────────────────────────────┼──────────────┼──────────────┼──────────────┼──────────────────┤ +│ 17.2.5 │ HBM Write and Atomic Traffic │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼───────────────────────────────────┼──────────────┼──────────────┼──────────────┼──────────────────┤ +│ 17.2.6 │ Remote Write and Atomic Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +├─────────┼───────────────────────────────────┼──────────────┼──────────────┼──────────────┼──────────────────┤ +│ 17.2.7 │ Atomic Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +├─────────┼───────────────────────────────────┼──────────────┼──────────────┼──────────────┼──────────────────┤ +│ 17.2.8 │ Uncached Write and Atomic Traffic │ 100.00 │ 100.00 │ 100.00 │ Pct │ +╘═════════╧═══════════════════════════════════╧══════════════╧══════════════╧══════════════╧══════════════════╛ +17.4 L2 - Fabric Interface Stalls +╒═════════╤════════════════════════════════╤════════════════════════╤═══════════════╤═══════╤═══════╤═══════╤════════╕ +│ Index │ Metric │ Type │ Transaction │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════════════════════════╪════════════════════════╪═══════════════╪═══════╪═══════╪═══════╪════════╡ +│ 17.4.3 │ Write - PCIe Stall │ PCIe Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.4 │ Write - Infinity Fabric™ Stall │ Infinity Fabric™ Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.5 │ Write - HBM Stall │ HBM Stall │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────┼────────────────────────┼───────────────┼───────┼───────┼───────┼────────┤ +│ 17.4.6 │ Write - Credit Starvation │ Credit Starvation │ Write │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧════════════════════════════════╧════════════════════════╧═══════════════╧═══════╧═══════╧═══════╧════════╛ +17.5 L2 - Fabric Detailed Transaction Breakdown +╒═════════╤═════════════════════════╤═════════════╤═════════════╤═════════════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════════════╪═════════════╪═════════════╪═════════════╪════════════════╡ +│ 17.5.5 │ Write (32B) │ 13421773.00 │ 13421773.00 │ 13421773.00 │ Req per kernel │ +├─────────┼─────────────────────────┼─────────────┼─────────────┼─────────────┼────────────────┤ +│ 17.5.6 │ Write (Uncached) │ 13421773.00 │ 13421773.00 │ 13421773.00 │ Req per kernel │ +├─────────┼─────────────────────────┼─────────────┼─────────────┼─────────────┼────────────────┤ +│ 17.5.7 │ Write (64B) │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ +├─────────┼─────────────────────────┼─────────────┼─────────────┼─────────────┼────────────────┤ +│ 17.5.8 │ HBM Write and Atomic │ 0.00 │ 0.00 │ 0.00 │ Req per kernel │ +├─────────┼─────────────────────────┼─────────────┼─────────────┼─────────────┼────────────────┤ +│ 17.5.9 │ Remote Write and Atomic │ 13421773.00 │ 13421773.00 │ 13421773.00 │ Req per kernel │ 
+├─────────┼─────────────────────────┼─────────────┼─────────────┼─────────────┼────────────────┤ +│ 17.5.10 │ Atomic │ 13421773.00 │ 13421773.00 │ 13421773.00 │ Req per kernel │ +╘═════════╧═════════════════════════╧═════════════╧═════════════╧═════════════╧════════════════╛ +``` + +In this case, there is quite a lot to unpack: + - For the first time, the 32B Write requests (17.5.5) are heavily used. + - These correspond to Atomic requests (17.2.7, 17.5.10), and are counted as Uncached Writes (17.5.6). + - The L2-Fabric Write and Atomic bandwidth metric (17.2.4) shows about 0.4 GiB of traffic. For convenience, the sample reduces the default problem size for this case due to the speed of atomics across a PCIe(r) bus, and finally, + - The traffic is directed to a remote device (17.2.6, 17.5.9) + +Let us consider what an "atomic" request means in this context. +Recall that we are discussing memory traffic flowing from the L2 cache, the device-wide coherence point on current CDNA accelerators such as the MI250, to e.g., the CPU's DRAM. +In this light, we see that these requests correspond to _system scope_ atomics, and specifically in the case of the MI250, to fine-grained memory! + + + +## Vector memory operation counting + +(flatmembench)= +### Global / Generic (FLAT) + +For this example, we consider the [vector-memory sample](https://github.com/AMDResearch/omniperf/blob/dev/sample/vmem.hip) distributed as a part of Omniperf. +This code launches many different versions of a simple read/write/atomic-only kernels targeting various address spaces, e.g. below is our simple `global_write` kernel: + +```c++ +// write to a global pointer +__global__ void global_write(int* ptr, int zero) { + ptr[threadIdx.x] = zero; +} +``` + +This example was compiled and run on an MI250 accelerator using ROCm v5.6.0, and Omniperf v2.0.0. +```shell-session +$ hipcc -O3 --save-temps vmem.hip -o vmem +``` +We have also chosen to include the `--save-temps` flag to save the compiler temporary files, such as the generated CDNA assembly code, for inspection. + +Finally, we generate our omniperf profile as: +```shell-session +$ omniperf profile -n vmem --no-roof -- ./vmem +``` + +(Flat_design)= +#### Design note + +We should explain some of the more peculiar line(s) of code in our example, e.g., the use of compiler builtins and explicit address space casting, etc. +```c++ +// write to a generic pointer +typedef int __attribute__((address_space(0)))* generic_ptr; + +__attribute__((noinline)) __device__ void generic_store(generic_ptr ptr, int zero) { *ptr = zero; } + +__global__ void generic_write(int* ptr, int zero, int filter) { + __shared__ int lds[1024]; + int* generic = (threadIdx.x < filter) ? &ptr[threadIdx.x] : &lds[threadIdx.x]; + generic_store((generic_ptr)generic, zero); +} +``` + +One of our aims in this example is to demonstrate the use of the ['generic' (a.k.a., FLAT)](https://llvm.org/docs/AMDGPUUsage.html#address-space-identifier) address space. +This address space is typically used when the compiler cannot statically prove where the backing memory is located. + +To try to _force_ the compiler to use this address space, we have applied `__attribute__((noinline))` to the `generic_store` function to have the compiler treat it as a function call (i.e., on the other-side of which, the address space may not be known). 
+However, in a trivial example such as this, the compiler may choose to specialize the `generic_store` function to the two address spaces that may provably be used from our translation-unit, i.e., ['local' (a.k.a., LDS)](Mspace) and ['global'](Mspace). Hence, we forcibly cast the address space to ['generic' (i.e., FLAT)](Mspace) to avoid this compiler optimization. + +```{warning} +While convenient for our example here, this sort of explicit address space casting can lead to strange compilation errors, and in the worst cases, incorrect results and thus use is discouraged in production code. +``` + +For more details on address spaces, the reader is referred to the [address-space section](Mspace). + +#### Global Write + +First, we demonstrate our simple `global_write` kernel: +```shell-session +$ omniperf analyze -p workloads/vmem/mi200/ --dispatch 1 -b 10.3 15.1.4 15.1.5 15.1.6 15.1.7 15.1.8 15.1.9 15.1.10 15.1.11 -n per_kernel +<...> +-------------------------------------------------------------------------------- +0. Top Stat +╒════╤═════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ +│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ +╞════╪═════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ +│ 0 │ global_write(int*, int) [clone .kd] │ 1.00 │ 2400.00 │ 2400.00 │ 2400.00 │ 100.00 │ +╘════╧═════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + + +-------------------------------------------------------------------------------- +10. Compute Units - Instruction Mix +10.3 VMEM Instr Mix +╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.1 │ Global/Generic Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.2 │ Global/Generic Write │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.3 │ Global/Generic Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + +-------------------------------------------------------------------------------- +15. 
Address Processing Unit and Data Return Path (TA/TD) +15.1 Address Processing Unit +╒═════════╤═════════════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 15.1.4 │ Total Instructions │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 15.1.5 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 15.1.6 │ Global/Generic Read Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 15.1.7 │ Global/Generic Write Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 15.1.8 │ Global/Generic Atomic Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 15.1.9 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 15.1.10 │ Spill/Stack Read Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼─────────────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 15.1.11 │ Spill/Stack Write Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +╘═════════╧═════════════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ +``` + +Here, we have presented both the information in the VMEM Instruction Mix table (10.3) and the Address Processing Unit (15.1). +We note that this data is expected to be identical, and hence we omit table 15.1 in our subsequent examples. + +In addition, as expected, we see a single Global/Generic write instruction (10.3.2, 15.1.7). +Inspecting the generated assembly: + +```asm + .protected _Z12global_writePii ; -- Begin function _Z12global_writePii + .globl _Z12global_writePii + .p2align 8 + .type _Z12global_writePii,@function +_Z12global_writePii: ; @_Z12global_writePii +; %bb.0: + s_load_dword s2, s[4:5], 0x8 + s_load_dwordx2 s[0:1], s[4:5], 0x0 + v_lshlrev_b32_e32 v0, 2, v0 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s2 + global_store_dword v0, v1, s[0:1] + s_endpgm + .section .rodata,#alloc + .p2align 6, 0x0 + .amdhsa_kernel _Z12global_writePii +``` + +we see that this corresponds to an instance of a `global_store_dword` operation. + +```{note} +The assembly in these experiments were generated for an [MI2XX](2xxnote) accelerator using ROCm 5.6.0, and may change depending on ROCm versions and the targeted hardware architecture +``` + +(Generic_write)= +#### Generic Write to LDS + +Next, we examine a generic write. +As discussed [previously](Flat_design), our `generic_write` kernel uses an address space cast to _force_ the compiler to choose our desired address space, regardless of other optimizations that may be possible. + +We also note that the `filter` parameter passed in as a kernel argument (see [example](https://github.com/AMDResearch/omniperf/blob/dev/sample/vmem.hip), or [design note](Flat_design)) is set to zero on the host, such that we always write to the 'local' (LDS) memory allocation `lds`. + +Examining this kernel in the VMEM Instruction Mix table yields: + +```shell-session +$ omniperf analyze -p workloads/vmem/mi200/ --dispatch 2 -b 10.3 -n per_kernel +<...> +0. 
Top Stat +╒════╤══════════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ +│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ +╞════╪══════════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ +│ 0 │ generic_write(int*, int, int) [clone .kd │ 1.00 │ 2880.00 │ 2880.00 │ 2880.00 │ 100.00 │ +│ │ ] │ │ │ │ │ │ +╘════╧══════════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + + +-------------------------------------------------------------------------------- +10. Compute Units - Instruction Mix +10.3 VMEM Instr Mix +╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.1 │ Global/Generic Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.2 │ Global/Generic Write │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.3 │ Global/Generic Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ +``` + +As expected we see a single generic write (10.3.2). +In the assembly generated for this kernel (in particular, we care about the `generic_store` function). We see that this corresponds to a `flat_store_dword` instruction: + +```asm + .type _Z13generic_storePii,@function +_Z13generic_storePii: ; @_Z13generic_storePii +; %bb.0: + s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + flat_store_dword v[0:1], v2 + s_waitcnt vmcnt(0) lgkmcnt(0) + s_setpc_b64 s[30:31] +.Lfunc_end0: +``` + +In addition, we note that we can observe the destination of this request by looking at the LDS Instructions metric (12.2.0): +```shell-session +$ omniperf analyze -p workloads/vmem/mi200/ --dispatch 2 -b 12.2.0 -n per_kernel +<...> +12. Local Data Share (LDS) +12.2 LDS Stats +╒═════════╤════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 12.2.0 │ LDS Instrs │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +╘═════════╧════════════╧═══════╧═══════╧═══════╧══════════════════╛ +``` +which indicates one LDS access. + +```{note} +Exercise for the reader: if this access had been targeted at global memory (e.g., by changing value of `filter`), where should we look for the memory traffic? Hint: see our [generic read](Generic_read) example. 
+``` + +#### Global read + +Next, we examine a simple global read operation: + +```c++ +__global__ void global_read(int* ptr, int zero) { + int x = ptr[threadIdx.x]; + if (x != zero) { + ptr[threadIdx.x] = x + 1; + } +} +``` + +Here we observe a now familiar pattern: + - Read a value in from global memory + - Have a write hidden behind a conditional that is impossible for the compiler to statically eliminate, but is identically false. In this case, our `main()` function initializes the data in `ptr` to zero. + +Running Omniperf on this kernel yields: + +```shell-session +$ omniperf analyze -p workloads/vmem/mi200/ --dispatch 3 -b 10.3 -n per_kernel +<...> +0. Top Stat +╒════╤════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ +│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ +╞════╪════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ +│ 0 │ global_read(int*, int) [clone .kd] │ 1.00 │ 4480.00 │ 4480.00 │ 4480.00 │ 100.00 │ +╘════╧════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + + +-------------------------------------------------------------------------------- +10. Compute Units - Instruction Mix +10.3 VMEM Instr Mix +╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.1 │ Global/Generic Read │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.2 │ Global/Generic Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.3 │ Global/Generic Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ +``` + +Here we see a single global/generic instruction (10.3.0) which, as expected, is a read (10.3.1). 
+
+(Generic_read)=
+#### Generic read from global memory
+
+For our generic read example, we choose to change the target of the generic read to be global memory:
+```c++
+__global__ void generic_read(int* ptr, int zero, int filter) {
+  __shared__ int lds[1024];
+  if (static_cast<int>(filter - 1) == zero) {
+    lds[threadIdx.x] = 0;  // initialize to zero to avoid conditional, but hide behind _another_ conditional
+  }
+  int* generic;
+  if (static_cast<int>(threadIdx.x) > filter - 1) {
+    generic = &ptr[threadIdx.x];
+  } else {
+    generic = &lds[threadIdx.x];
+    abort();
+  }
+  int x = generic_load((generic_ptr)generic);
+  if (x != zero) {
+    ptr[threadIdx.x] = x + 1;
+  }
+}
+```
+
+In addition to our usual `if (condition_that_wont_happen)` guard around the write operation, there is an additional conditional around the initialization of the `lds` buffer.
+We note that it's typically required to write to this buffer to prevent the compiler from eliminating the local memory branch entirely due to undefined behavior (use of an uninitialized value).
+However, to report _only_ our global memory read, we again hide this initialization behind an identically false conditional (both `zero` and `filter` are set to zero in the kernel launch). Note that this is a _different_ conditional from our pointer assignment (to avoid combination of the two).
+
+Running Omniperf on this kernel reports:
+```shell-session
+$ omniperf analyze -p workloads/vmem/mi200/ --dispatch 4 -b 10.3 12.2.0 16.3.10 -n per_kernel
+<...>
+0. Top Stat
+╒════╤══════════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕
+│    │ KernelName                               │ Count   │ Sum(ns)   │ Mean(ns)   │ Median(ns)   │ Pct    │
+╞════╪══════════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡
+│  0 │ generic_read(int*, int, int) [clone .kd] │ 1.00    │ 2240.00   │ 2240.00    │ 2240.00      │ 100.00 │
+╘════╧══════════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛
+
+
+--------------------------------------------------------------------------------
+10. 
Compute Units - Instruction Mix +10.3 VMEM Instr Mix +╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.1 │ Global/Generic Read │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.2 │ Global/Generic Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.3 │ Global/Generic Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + +-------------------------------------------------------------------------------- +12. Local Data Share (LDS) +12.2 LDS Stats +╒═════════╤════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 12.2.0 │ LDS Instrs │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +╘═════════╧════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + +-------------------------------------------------------------------------------- +16. Vector L1 Data Cache +16.3 L1D Cache Accesses +╒═════════╤════════════╤═══════╤═══════╤═══════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════╪═══════╪═══════╪═══════╪════════════════╡ +│ 16.3.10 │ L1-L2 Read │ 1.00 │ 1.00 │ 1.00 │ Req per kernel │ +╘═════════╧════════════╧═══════╧═══════╧═══════╧════════════════╛ +``` + +Here we observe: + - A single global/generic read operation (10.3.1), which + - Is not an LDS instruction (12.2), as seen in our [generic write](Generic_write) example, but is instead + - An L1-L2 read operation (16.3.10) + +That is, we have successfully targeted our generic read at global memory. +Inspecting the assembly shows this corresponds to a `flat_load_dword` instruction. + +(Global_atomic)= +#### Global atomic + +Our global atomic kernel: +```c++ +__global__ void global_atomic(int* ptr, int zero) { + atomicAdd(ptr, zero); +} +``` +simply atomically adds a (non-compile-time) zero value to a pointer. + +Running Omniperf on this kernel yields: +```shell-session +$ omniperf analyze -p workloads/vmem/mi200/ --dispatch 5 -b 10.3 16.3.12 -n per_kernel +<...> +0. 
Top Stat +╒════╤══════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ +│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ +╞════╪══════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ +│ 0 │ global_atomic(int*, int) [clone .kd] │ 1.00 │ 4640.00 │ 4640.00 │ 4640.00 │ 100.00 │ +╘════╧══════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + + +-------------------------------------------------------------------------------- +10. Compute Units - Instruction Mix +10.3 VMEM Instr Mix +╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.1 │ Global/Generic Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.2 │ Global/Generic Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.3 │ Global/Generic Atomic │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + +-------------------------------------------------------------------------------- +16. Vector L1 Data Cache +16.3 L1D Cache Accesses +╒═════════╤══════════════╤═══════╤═══════╤═══════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪══════════════╪═══════╪═══════╪═══════╪════════════════╡ +│ 16.3.12 │ L1-L2 Atomic │ 1.00 │ 1.00 │ 1.00 │ Req per kernel │ +╘═════════╧══════════════╧═══════╧═══════╧═══════╧════════════════╛ +``` + +Here we see a single global/generic atomic instruction (10.3.3), which corresponds to an L1-L2 atomic request (16.3.12). + +(Generic_atomic)= +#### Generic, mixed atomic + +In our final global/generic example, we look at a case where our generic operation targets both LDS and global memory: +```c++ +__global__ void generic_atomic(int* ptr, int filter, int zero) { + __shared__ int lds[1024]; + int* generic = (threadIdx.x % 2 == filter) ? &ptr[threadIdx.x] : &lds[threadIdx.x]; + generic_atomic((generic_ptr)generic, zero); +} +``` + +This assigns every other work-item to atomically update global memory or local memory. + +Running this kernel through Omniperf shows: +```shell-session +$ omniperf analyze -p workloads/vmem/mi200/ --dispatch 6 -b 10.3 12.2.0 16.3.12 -n per_kernel +<...> +0. 
Top Stat +╒════╤══════════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕ +│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ +╞════╪══════════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡ +│ 0 │ generic_atomic(int*, int, int) [clone .k │ 1.00 │ 3360.00 │ 3360.00 │ 3360.00 │ 100.00 │ +│ │ d] │ │ │ │ │ │ +╘════╧══════════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛ + + +10. Compute Units - Instruction Mix +10.3 VMEM Instr Mix +╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 10.3.0 │ Global/Generic Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.1 │ Global/Generic Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.2 │ Global/Generic Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.3 │ Global/Generic Atomic │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.4 │ Spill/Stack Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.6 │ Spill/Stack Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤ +│ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │ +╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + +-------------------------------------------------------------------------------- +12. Local Data Share (LDS) +12.2 LDS Stats +╒═════════╤════════════╤═══════╤═══════╤═══════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════╪═══════╪═══════╪═══════╪══════════════════╡ +│ 12.2.0 │ LDS Instrs │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │ +╘═════════╧════════════╧═══════╧═══════╧═══════╧══════════════════╛ + + +-------------------------------------------------------------------------------- +16. Vector L1 Data Cache +16.3 L1D Cache Accesses +╒═════════╤══════════════╤═══════╤═══════╤═══════╤════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪══════════════╪═══════╪═══════╪═══════╪════════════════╡ +│ 16.3.12 │ L1-L2 Atomic │ 1.00 │ 1.00 │ 1.00 │ Req per kernel │ +╘═════════╧══════════════╧═══════╧═══════╧═══════╧════════════════╛ +``` + +That is, we see: + - A single generic atomic instruction (10.3.3) that maps to both + - an LDS instruction (12.2.0), and + - an L1-L2 atomic request (16.3) + +We have demonstrated the ability of the generic address space to _dynamically_ target different backing memory! + +(buffermembench)= +### Spill/Scratch (BUFFER) + +Next we examine the use of 'Spill/Scratch' memory. +On current CDNA accelerators such as the [MI2XX](2xxnote), this is implemented using the [private](mspace) memory space, which maps to ['scratch' memory](https://llvm.org/docs/AMDGPUUsage.html#amdgpu-address-spaces) in AMDGPU hardware terminology. 

This type of memory can be accessed via different instructions depending on the specific architecture targeted. However, current CDNA accelerators such as the [MI2XX](2xxnote) use so-called `buffer` instructions to access private memory in a simple (and typically coalesced) manner. See [Sec. 9.1, 'Vector Memory Buffer Instructions' of the CDNA2 ISA guide](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf) for further reading on this instruction type.

We develop a [simple kernel](https://github.com/AMDResearch/omniperf/blob/dev/sample/stack.hip) that uses stack memory:
```c++
#include <hip/hip_runtime.h>
__global__ void knl(int* out, int filter) {
  int x[1024];
  x[filter] = 0;
  if (threadIdx.x < filter)
    out[threadIdx.x] = x[threadIdx.x];
}
```

Our strategy here is to:
  - Create a large stack buffer (that cannot reasonably fit into registers),
  - Write to a compile-time unknown location on the stack, and then,
  - Behind the typical compile-time unknown `if(condition_that_wont_happen)` guard,
  - Read from a different, compile-time unknown, location on the stack and write the result to global memory to prevent the compiler from optimizing it out.

This example was compiled and run on an MI250 accelerator using ROCm v5.6.0, and Omniperf v2.0.0:
```shell-session
$ hipcc -O3 stack.hip -o stack
```
and profiled using Omniperf:
```shell-session
$ omniperf profile -n stack --no-roof -- ./stack
<...>
$ omniperf analyze -p workloads/stack/mi200/ -b 10.3 16.3.11 -n per_kernel
<...>
10. Compute Units - Instruction Mix
10.3 VMEM Instr Mix
╒═════════╤═══════════════════════╤═══════╤═══════╤═══════╤══════════════════╕
│ Index │ Metric │ Avg │ Min │ Max │ Unit │
╞═════════╪═══════════════════════╪═══════╪═══════╪═══════╪══════════════════╡
│ 10.3.0 │ Global/Generic Instr │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │
├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤
│ 10.3.1 │ Global/Generic Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │
├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤
│ 10.3.2 │ Global/Generic Write │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │
├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤
│ 10.3.3 │ Global/Generic Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │
├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤
│ 10.3.4 │ Spill/Stack Instr │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │
├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤
│ 10.3.5 │ Spill/Stack Read │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │
├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤
│ 10.3.6 │ Spill/Stack Write │ 1.00 │ 1.00 │ 1.00 │ Instr per kernel │
├─────────┼───────────────────────┼───────┼───────┼───────┼──────────────────┤
│ 10.3.7 │ Spill/Stack Atomic │ 0.00 │ 0.00 │ 0.00 │ Instr per kernel │
╘═════════╧═══════════════════════╧═══════╧═══════╧═══════╧══════════════════╛


--------------------------------------------------------------------------------
16. 
Vector L1 Data Cache
16.3 L1D Cache Accesses
╒═════════╤═════════════╤═══════╤═══════╤═══════╤════════════════╕
│ Index │ Metric │ Avg │ Min │ Max │ Unit │
╞═════════╪═════════════╪═══════╪═══════╪═══════╪════════════════╡
│ 16.3.11 │ L1-L2 Write │ 1.00 │ 1.00 │ 1.00 │ Req per kernel │
╘═════════╧═════════════╧═══════╧═══════╧═══════╧════════════════╛
```

Here we see a single write to the stack (10.3.6), which corresponds to an L1-L2 write request (16.3.11), i.e., the stack is backed by global memory and travels through the same memory hierarchy.

(IPC_example)=
## Instructions-per-cycle and Utilizations example

For this section, we use the instructions-per-cycle (IPC) [example](https://github.com/AMDResearch/omniperf/blob/dev/sample/ipc.hip) included with Omniperf.

This example is compiled using `c++17` support:

```shell-session
$ hipcc -O3 ipc.hip -o ipc -std=c++17
```

and was run on an MI250 CDNA2 accelerator:

```shell-session
$ omniperf profile -n ipc --no-roof -- ./ipc
```

The results shown in this section are _generally_ applicable to CDNA accelerators, but may vary between generations and specific products.

### Design note

The kernels in this example all execute a specific assembly operation `N` times (1000, by default); for instance, the `vmov` kernel:

```c++
template <int N>
__device__ void vmov_op() {
  int dummy;
  if constexpr (N >= 1) {
    asm volatile("v_mov_b32 v0, v1\n" : : "{v31}"(dummy));
    vmov_op<N - 1>();
  }
}

template <int N>
__global__ void vmov() {
  vmov_op<N>();
}
```

The kernels are then launched twice, once for a warm-up run, and once for measurement.

(VALU_ipc)=
### VALU Utilization and IPC

Now we can use our test to measure the achieved instructions-per-cycle of various types of instructions.
We start with a simple [VALU](valu) operation, i.e., a `v_mov_b32` instruction, e.g.:

```asm
v_mov_b32 v0, v1
```

This instruction simply copies the contents of the source register (`v1`) to the destination register (`v0`).
Investigating this kernel with Omniperf, we see:

```shell-session
$ omniperf analyze -p workloads/ipc/mi200/ --dispatch 7 -b 11.2
<...>
--------------------------------------------------------------------------------
0. Top Stat
╒════╤═══════════════════════════════╤═════════╤═════════════╤═════════════╤══════════════╤════════╕
│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │
╞════╪═══════════════════════════════╪═════════╪═════════════╪═════════════╪══════════════╪════════╡
│ 0 │ void vmov<1000>() [clone .kd] │ 1.00 │ 99317423.00 │ 99317423.00 │ 99317423.00 │ 100.00 │
╘════╧═══════════════════════════════╧═════════╧═════════════╧═════════════╧══════════════╧════════╛


--------------------------------------------------------------------------------
11. 
Compute Units - Compute Pipeline
11.2 Pipeline Stats
╒═════════╤═════════════════════╤═══════╤═══════╤═══════╤══════════════╕
│ Index │ Metric │ Avg │ Min │ Max │ Unit │
╞═════════╪═════════════════════╪═══════╪═══════╪═══════╪══════════════╡
│ 11.2.0 │ IPC │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.1 │ IPC (Issued) │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.2 │ SALU Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.3 │ VALU Util │ 99.98 │ 99.98 │ 99.98 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.4 │ VMEM Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.5 │ Branch Util │ 0.1 │ 0.1 │ 0.1 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.6 │ VALU Active Threads │ 64.0 │ 64.0 │ 64.0 │ Threads │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.7 │ MFMA Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.8 │ MFMA Instr Cycles │ │ │ │ Cycles/instr │
╘═════════╧═════════════════════╧═══════╧═══════╧═══════╧══════════════╛
```

Here we see that:

 1. Both the IPC (11.2.0) and "Issued" IPC (11.2.1) metrics are $\sim 1$,
 2. The VALU Utilization metric (11.2.3) is also $\sim100\%$, and finally
 3. The VALU Active Threads metric (11.2.6) is 64, i.e., the wavefront size on CDNA accelerators, as all threads in the wavefront are active.

We will explore the difference between the IPC (11.2.0) and "Issued" IPC (11.2.1) metrics in the [next section](Issued_ipc).

Additionally, we notice a small (0.1%) Branch utilization (11.2.5).
Inspecting the assembly of this kernel shows there are no branch operations; however, recalling the note in the [Pipeline statistics](Pipeline_stats) section:

> the Branch utilization <...> includes time spent in other instruction types (namely: `s_endpgm`) that are _typically_ a very small percentage of the overall kernel execution.

we see that this is coming from execution of the `s_endpgm` instruction at the end of every wavefront.

```{note}
Technically, the cycle counts used in the denominators of our IPC metrics are in units of quad-cycles, a group of 4 consecutive cycles.
However, a typical [VALU](valu) instruction on CDNA accelerators runs for a single quad-cycle (see [Layla Mah's GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah), slide 30).
Therefore, for simplicity, we report these metrics as "instructions per cycle".
```

(Issued_ipc)=
### Exploring "Issued" IPC via MFMA operations

```{warning}
The MFMA assembly operations used in this example are inherently unportable to older CDNA architectures.
```

Unlike the simple quad-cycle `v_mov_b32` operation discussed in our [previous example](VALU_ipc), some operations take many quad-cycles to execute. 
+For example, using the [AMD Matrix Instruction Calculator](https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator#example-of-querying-instruction-information) we can see that some [MFMA](mfma) operations take 64 cycles, e.g.: + +```shell-session +$ ./matrix_calculator.py --arch CDNA2 --detail-instruction --instruction v_mfma_f32_32x32x8bf16_1k +Architecture: CDNA2 +Instruction: V_MFMA_F32_32X32X8BF16_1K +<...> + Execution statistics: + FLOPs: 16384 + Execution cycles: 64 + FLOPs/CU/cycle: 1024 + Can co-execute with VALU: True + VALU co-execution cycles possible: 60 +``` + +What happens to our IPC when we utilize this `v_mfma_f32_32x32x8bf16_1k` instruction on a CDNA2 accelerator? +To find out, we turn to our `mfma` kernel in the IPC example: + +```shell-session +$ omniperf analyze -p workloads/ipc/mi200/ --dispatch 8 -b 11.2 --decimal 4 +<...> +-------------------------------------------------------------------------------- +0. Top Stat +╒════╤═══════════════════════════════╤═════════╤═════════════════╤═════════════════╤═════════════════╤══════════╕ +│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ +╞════╪═══════════════════════════════╪═════════╪═════════════════╪═════════════════╪═════════════════╪══════════╡ +│ 0 │ void mfma<1000>() [clone .kd] │ 1.0000 │ 1623167595.0000 │ 1623167595.0000 │ 1623167595.0000 │ 100.0000 │ +╘════╧═══════════════════════════════╧═════════╧═════════════════╧═════════════════╧═════════════════╧══════════╛ + + +-------------------------------------------------------------------------------- +11. Compute Units - Compute Pipeline +11.2 Pipeline Stats +╒═════════╤═════════════════════╤═════════╤═════════╤═════════╤══════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════════╪═════════╪═════════╪═════════╪══════════════╡ +│ 11.2.0 │ IPC │ 0.0626 │ 0.0626 │ 0.0626 │ Instr/cycle │ +├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ +│ 11.2.1 │ IPC (Issued) │ 1.0000 │ 1.0000 │ 1.0000 │ Instr/cycle │ +├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ +│ 11.2.2 │ SALU Util │ 0.0000 │ 0.0000 │ 0.0000 │ Pct │ +├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ +│ 11.2.3 │ VALU Util │ 6.2496 │ 6.2496 │ 6.2496 │ Pct │ +├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ +│ 11.2.4 │ VMEM Util │ 0.0000 │ 0.0000 │ 0.0000 │ Pct │ +├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ +│ 11.2.5 │ Branch Util │ 0.0062 │ 0.0062 │ 0.0062 │ Pct │ +├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ +│ 11.2.6 │ VALU Active Threads │ 64.0000 │ 64.0000 │ 64.0000 │ Threads │ +├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ +│ 11.2.7 │ MFMA Util │ 99.9939 │ 99.9939 │ 99.9939 │ Pct │ +├─────────┼─────────────────────┼─────────┼─────────┼─────────┼──────────────┤ +│ 11.2.8 │ MFMA Instr Cycles │ 64.0000 │ 64.0000 │ 64.0000 │ Cycles/instr │ +╘═════════╧═════════════════════╧═════════╧═════════╧═════════╧══════════════╛ +``` + +In contrast to our [VALU IPC example](VALU_ipc), we now see that the IPC metric (11.2.0) and Issued IPC (11.2.1) metric differ substantially. +First, we see the VALU utilization (11.2.3) has decreased substantially, from nearly 100% to $\sim6.25\%$. 
We note that this matches the ratio of:

```math
((Execution\ cycles) - (VALU\ coexecution\ cycles)) / (Execution\ cycles)
```
reported by the matrix calculator, while the MFMA utilization (11.2.7) has increased to nearly 100%.

Recall: our `v_mfma_f32_32x32x8bf16_1k` instruction takes 64 cycles to execute, or 16 quad-cycles, matching our observed MFMA Instruction Cycles (11.2.8).
That is, we have a single instruction executed every 16 quad-cycles, or:

```math
1/16 = 0.0625
```

which is almost identical to our IPC metric (11.2.0).
Why, then, is the Issued IPC metric (11.2.1) equal to 1.0?

Instead of simply counting the number of instructions issued and dividing by the number of cycles the [CUs](CU) on the accelerator were active (as is done for 11.2.0), this metric counts the number of (non-[internal](Internal_ipc)) instructions issued and divides by the number of (quad-) cycles where the [scheduler](scheduler) was actively working on issuing instructions.
Thus the Issued IPC metric (11.2.1) answers more of a "for what percent of the total number of [scheduler](scheduler) cycles did a wave schedule an instruction?" question, while the IPC metric (11.2.0) is the ratio of the number of instructions executed to the total [active CU cycles](TotalActiveCUCycles).

```{warning}
There are further subtleties of the Issued IPC metric (11.2.1) that complicate its use.
We will explore these in the [subsequent section](Internal_ipc).
For these reasons, Omniperf typically promotes use of the regular IPC metric (11.2.0), e.g., in the top-level Speed-of-Light chart.
```

(Internal_ipc)=
### "Internal" instructions and IPC

Next, we explore the concept of an "internal" instruction.
From [Layla Mah's GCN Crash Course](https://www.slideshare.net/DevCentralAMD/gs4106-the-amd-gcn-architecture-a-crash-course-by-layla-mah) (slide 29), we see a few candidates for internal instructions, and we choose an `s_nop` instruction, which according to the [CDNA2 ISA Guide](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf):

>Does nothing; it can be repeated in hardware up to eight times.

Here we choose to use a no-op of:

```asm
s_nop 0x0
```

to make our point. Running this kernel through Omniperf yields:

```shell-session
$ omniperf analyze -p workloads/ipc/mi200/ --dispatch 9 -b 11.2
<...>
--------------------------------------------------------------------------------
0. Top Stat
╒════╤═══════════════════════════════╤═════════╤═════════════╤═════════════╤══════════════╤════════╕
│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │
╞════╪═══════════════════════════════╪═════════╪═════════════╪═════════════╪══════════════╪════════╡
│ 0 │ void snop<1000>() [clone .kd] │ 1.00 │ 14221851.50 │ 14221851.50 │ 14221851.50 │ 100.00 │
╘════╧═══════════════════════════════╧═════════╧═════════════╧═════════════╧══════════════╧════════╛


--------------------------------------------------------------------------------
11. 
Compute Units - Compute Pipeline
11.2 Pipeline Stats
╒═════════╤═════════════════════╤═══════╤═══════╤═══════╤══════════════╕
│ Index │ Metric │ Avg │ Min │ Max │ Unit │
╞═════════╪═════════════════════╪═══════╪═══════╪═══════╪══════════════╡
│ 11.2.0 │ IPC │ 6.79 │ 6.79 │ 6.79 │ Instr/cycle │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.1 │ IPC (Issued) │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.2 │ SALU Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.3 │ VALU Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.4 │ VMEM Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.5 │ Branch Util │ 0.68 │ 0.68 │ 0.68 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.6 │ VALU Active Threads │ │ │ │ Threads │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.7 │ MFMA Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.8 │ MFMA Instr Cycles │ │ │ │ Cycles/instr │
╘═════════╧═════════════════════╧═══════╧═══════╧═══════╧══════════════╛
```

First, we see that the IPC metric (11.2.0) exceeds our theoretical maximum of 5 instructions per cycle (discussed in the [scheduler](scheduler) section).
How can this be?

Recall that Layla's slides say "no functional unit" for the internal instructions.
This removes the usual limitation on the IPC: if we are _only_ issuing internal instructions, we are not issuing to any execution units!
However, workloads such as this one are almost _entirely_ artificial (i.e., they issue internal instructions almost exclusively); in practice, a maximum IPC of 5 is expected in almost all cases.

Secondly, we note that our "Issued" IPC (11.2.1) is still identical to one here.
Again, this has to do with the details of "internal" instructions.
Recall that in our [previous example](Issued_ipc) we defined this metric as explicitly excluding internal instruction counts.
The logical question then is, 'what _is_ this metric counting in our `s_nop` kernel?'

The generated assembly looks something like:

```asm
;;#ASMSTART
s_nop 0x0
;;#ASMEND
;;#ASMSTART
s_nop 0x0
;;#ASMEND
;;<... omitting many more ...>
s_endpgm
.section .rodata,#alloc
.p2align 6, 0x0
.amdhsa_kernel _Z4snopILi1000EEvv
```

Of particular interest here is the `s_endpgm` instruction, of which the [CDNA2 ISA guide](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf) states:

>End of program; terminate wavefront.

This is not on our list of internal instructions from Layla's tutorial, and is therefore counted as part of our Issued IPC (11.2.1).
Thus, the Issued IPC being equal to one here indicates that an `s_endpgm` instruction was issued on every (quad-) cycle where the [scheduler](scheduler) was issuing non-internal instructions, which is expected as this was our _only_ non-internal instruction!

(SALU_ipc)=
### SALU Utilization

Next, we explore a simple [SALU](salu) kernel in our ongoing IPC and utilization example. 
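
The `smov` kernel used for this test is not reproduced in the text; judging from the kernel name reported below, it presumably mirrors the recursive-template pattern of the `vmov` kernel from the design note, with the vector move swapped for a scalar move. A minimal sketch under that assumption (the `smov_op` helper name and the SGPR operand constraint are our own guesses, not the sample's exact source):

```c++
// Sketch only -- assumes the same recursive-template structure as vmov_op.
template <int N>
__device__ void smov_op() {
  int dummy;
  if constexpr (N >= 1) {
    // Scalar move; the "{s31}" constraint ties `dummy` to a scalar register,
    // mirroring the "{v31}" trick used in the vmov kernel.
    asm volatile("s_mov_b32 s0, s1\n" : : "{s31}"(dummy));
    smov_op<N - 1>();
  }
}

template <int N>
__global__ void smov() {
  smov_op<N>();
}
```
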
For this case, we select a simple scalar move operation, e.g.:

```asm
s_mov_b32 s0, s1
```

which, analogous to our [`v_mov`](VALU_ipc) example, copies the contents of the source scalar register (`s1`) to the destination scalar register (`s0`).
Running this kernel through Omniperf yields:

```shell-session
$ omniperf analyze -p workloads/ipc/mi200/ --dispatch 10 -b 11.2
<...>
--------------------------------------------------------------------------------
0. Top Stat
╒════╤═══════════════════════════════╤═════════╤═════════════╤═════════════╤══════════════╤════════╕
│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │
╞════╪═══════════════════════════════╪═════════╪═════════════╪═════════════╪══════════════╪════════╡
│ 0 │ void smov<1000>() [clone .kd] │ 1.00 │ 96246554.00 │ 96246554.00 │ 96246554.00 │ 100.00 │
╘════╧═══════════════════════════════╧═════════╧═════════════╧═════════════╧══════════════╧════════╛


--------------------------------------------------------------------------------
11. Compute Units - Compute Pipeline
11.2 Pipeline Stats
╒═════════╤═════════════════════╤═══════╤═══════╤═══════╤══════════════╕
│ Index │ Metric │ Avg │ Min │ Max │ Unit │
╞═════════╪═════════════════════╪═══════╪═══════╪═══════╪══════════════╡
│ 11.2.0 │ IPC │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.1 │ IPC (Issued) │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.2 │ SALU Util │ 99.98 │ 99.98 │ 99.98 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.3 │ VALU Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.4 │ VMEM Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.5 │ Branch Util │ 0.1 │ 0.1 │ 0.1 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.6 │ VALU Active Threads │ │ │ │ Threads │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.7 │ MFMA Util │ 0.0 │ 0.0 │ 0.0 │ Pct │
├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
│ 11.2.8 │ MFMA Instr Cycles │ │ │ │ Cycles/instr │
╘═════════╧═════════════════════╧═══════╧═══════╧═══════╧══════════════╛
```

Here we see that:
 - both our IPC (11.2.0) and Issued IPC (11.2.1) are $\sim1.0$ as expected, and
 - the SALU Utilization (11.2.2) was nearly 100%, as it was active for almost the entire kernel.

(VALU_Active_Threads)=
### VALU Active Threads

For our final IPC/Utilization example, we consider a slight modification of our [`v_mov`](VALU_ipc) example:

```c++
template <int N>
__global__ void vmov_with_divergence() {
  if (threadIdx.x % 64 == 0)
    vmov_op<N>();
}
```

That is, we wrap our [VALU](valu) operation inside a conditional where only one lane in our wavefront is active.
Running this kernel through Omniperf yields:

```shell-session
$ omniperf analyze -p workloads/ipc/mi200/ --dispatch 11 -b 11.2
<...>
--------------------------------------------------------------------------------
0. 
Top Stat +╒════╤══════════════════════════════════════════╤═════════╤═════════════╤═════════════╤══════════════╤════════╕ +│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ +╞════╪══════════════════════════════════════════╪═════════╪═════════════╪═════════════╪══════════════╪════════╡ +│ 0 │ void vmov_with_divergence<1000>() [clone │ 1.00 │ 97125097.00 │ 97125097.00 │ 97125097.00 │ 100.00 │ +│ │ .kd] │ │ │ │ │ │ +╘════╧══════════════════════════════════════════╧═════════╧═════════════╧═════════════╧══════════════╧════════╛ + + +-------------------------------------------------------------------------------- +11. Compute Units - Compute Pipeline +11.2 Pipeline Stats +╒═════════╤═════════════════════╤═══════╤═══════╤═══════╤══════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═════════════════════╪═══════╪═══════╪═══════╪══════════════╡ +│ 11.2.0 │ IPC │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │ +├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ +│ 11.2.1 │ IPC (Issued) │ 1.0 │ 1.0 │ 1.0 │ Instr/cycle │ +├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ +│ 11.2.2 │ SALU Util │ 0.1 │ 0.1 │ 0.1 │ Pct │ +├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ +│ 11.2.3 │ VALU Util │ 99.98 │ 99.98 │ 99.98 │ Pct │ +├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ +│ 11.2.4 │ VMEM Util │ 0.0 │ 0.0 │ 0.0 │ Pct │ +├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ +│ 11.2.5 │ Branch Util │ 0.2 │ 0.2 │ 0.2 │ Pct │ +├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ +│ 11.2.6 │ VALU Active Threads │ 1.13 │ 1.13 │ 1.13 │ Threads │ +├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ +│ 11.2.7 │ MFMA Util │ 0.0 │ 0.0 │ 0.0 │ Pct │ +├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤ +│ 11.2.8 │ MFMA Instr Cycles │ │ │ │ Cycles/instr │ +╘═════════╧═════════════════════╧═══════╧═══════╧═══════╧══════════════╛ +``` + +Here we see that once again, our VALU Utilization (11.2.3) is nearly 100%. +However, we note that the VALU Active Threads metric (11.2.6) is $\sim 1$, which matches our conditional in the source code. +So VALU Active Threads reports the average number of lanes of our wavefront that are active over all [VALU](valu) instructions, or thread "convergence" (i.e., 1 - [divergence](Divergence)). + +```{note} +We note here that: + +1. The act of evaluating a vector conditional in this example typically triggers VALU operations, contributing to why the VALU Active Threads metric is not identically one. +2. This metric is a time (cycle) averaged value, and thus contains an implicit dependence on the duration of various VALU instructions. + +Nonetheless, this metric serves as a useful measure of thread-convergence. +``` + +Finally, we note that our branch utilization (11.2.5) has increased slightly from our baseline, as we now have a branch (checking the value of `threadIdx.x`). + +## LDS Examples + +For this example, we consider the [LDS sample](https://github.com/AMDResearch/omniperf/blob/dev/sample/lds.hip) distributed as a part of Omniperf. +This code contains two kernels to explore how both [LDS](lds) bandwidth and bank conflicts are calculated in Omniperf. + +This example was compiled and run on an MI250 accelerator using ROCm v5.6.0, and Omniperf v2.0.0. 
+```shell-session +$ hipcc -O3 lds.hip -o lds +``` + +Finally, we generate our omniperf profile as: +```shell-session +$ omniperf profile -n lds --no-roof -- ./lds +``` + +(lds_bandwidth)= +### LDS Bandwidth + +To explore our 'theoretical LDS bandwidth' metric, we use a simple kernel: + +```c++ +constexpr unsigned max_threads = 256; +__global__ void load(int* out, int flag) { + __shared__ int array[max_threads]; + int index = threadIdx.x; + // fake a store to the LDS array to avoid unwanted behavior + if (flag) + array[max_threads - index] = index; + __syncthreads(); + int x = array[index]; + if (x == int(-1234567)) + out[threadIdx.x] = x; +} +``` + +Here we: + - Create an array of 256 integers in [LDS](lds) + - Fake a write to the LDS using the `flag` variable (always set to zero on the host) to avoid dead-code elimination + - Read a single integer per work-item from `threadIdx.x` of the LDS array + - If the integer is equal to a magic number (always false), write the value out to global memory to again, avoid dead-code elimination + +Finally, we launch this kernel repeatedly, varying the number of threads in our workgroup: + +```c++ +void bandwidth_demo(int N) { + for (int i = 1; i <= N; ++i) + load<<<1,i>>>(nullptr, 0); + hipDeviceSynchronize(); +} +``` + +Next, let's analyze the first of our bandwidth kernel dispatches: + +```shell-session +$ omniperf analyze -p workloads/lds/mi200/ -b 12.2.1 --dispatch 0 -n per_kernel +<...> +12. Local Data Share (LDS) +12.2 LDS Stats +╒═════════╤═══════════════════════╤════════╤════════╤════════╤══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪═══════════════════════╪════════╪════════╪════════╪══════════════════╡ +│ 12.2.1 │ Theoretical Bandwidth │ 256.00 │ 256.00 │ 256.00 │ Bytes per kernel │ +╘═════════╧═══════════════════════╧════════╧════════╧════════╧══════════════════╛ +``` + +Here we see that our Theoretical Bandwidth metric (12.2.1) is reporting 256 Bytes were loaded even though we launched a single work-item workgroup, and thus only loaded a single integer from LDS. Why is this? + +Recall our definition of this metric: + +> Indicates the maximum amount of bytes that could have been loaded from/stored to/atomically updated in the LDS per [normalization-unit](normunit). + +Here we see that this instruction _could_ have loaded up to 256 bytes of data (4 bytes for each work-item in the wavefront), and therefore this is the expected value for this metric in Omniperf, hence why this metric is named the "theoretical" bandwidth. + +To further illustrate this point we plot the relationship of the theoretical bandwidth metric (12.2.1) as compared to the effective (or achieved) bandwidth of this kernel, varying the number of work-items launched from 1 to 256: + +```{figure} images/ldsbandwidth.* +:scale: 50 % +:alt: Comparison of effective bandwidth versus the theoretical bandwidth metric in Omniperf for our simple example. +:align: center + +Comparison of effective bandwidth versus the theoretical bandwidth metric in Omniperf for our simple example. +``` + +Here we see that the theoretical bandwidth metric follows a step-function. It increases only when another wavefront issues an LDS instruction for up to 256 bytes of data. Such increases are marked in the plot using dashed lines. +In contrast, the effective bandwidth increases linearly, by 4 bytes, with the number of work-items in the kernel, N. 
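
To make the step-function behavior concrete, both curves in the figure can be modeled with a few lines of host-side arithmetic. The sketch below is only a model of the two quantities described above (it assumes a 64-work-item wavefront and one 4-byte load per work-item, as in this example) and is not part of the Omniperf sample itself:

```c++
#include <cstdio>

int main() {
  constexpr int wave_size = 64;      // wavefront size on CDNA accelerators
  constexpr int bytes_per_item = 4;  // each work-item loads one 4B integer
  for (int n = 1; n <= 256; ++n) {
    // Effective bandwidth grows linearly: every work-item loads 4 bytes.
    int effective = n * bytes_per_item;
    // The theoretical metric counts 256 bytes per LDS instruction, and one
    // LDS instruction is issued per wavefront, so it steps every 64 items.
    int wavefronts = (n + wave_size - 1) / wave_size;
    int theoretical = wavefronts * wave_size * bytes_per_item;
    std::printf("%3d work-items: effective %4d B, theoretical %4d B\n", n,
                effective, theoretical);
  }
  return 0;
}
```

Each additional wavefront adds one more LDS instruction, and hence another 256 bytes to the theoretical count, while the effective count grows by only 4 bytes per work-item.
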
+ +(lds_bank_conflicts)= +### Bank Conflicts + +Next we explore bank conflicts using a slight modification of our bandwidth kernel: + +```c++ +constexpr unsigned nbanks = 32; +__global__ void conflicts(int* out, int flag) { + constexpr unsigned nelements = nbanks * max_threads; + __shared__ int array[nelements]; + // each thread reads from the same bank + int index = threadIdx.x * nbanks; + // fake a store to the LDS array to avoid unwanted behavior + if (flag) + array[max_threads - index] = index; + __syncthreads(); + int x = array[index]; + if (x == int(-1234567)) + out[threadIdx.x] = x; +} +``` + +Here we: + - Allocate an [LDS](lds) array of size $32*256*4{B}=32{KiB}$ + - Fake a write to the LDS using the `flag` variable (always set to zero on the host) to avoid dead-code elimination + - Read a single integer per work-item from index `threadIdx.x * nbanks` of the LDS array + - If the integer is equal to a magic number (always false), write the value out to global memory to, again, avoid dead-code elimination. + +On the host, we again repeatedly launch this kernel, varying the number of work-items: + +```c++ +void conflicts_demo(int N) { + for (int i = 1; i <= N; ++i) + conflicts<<<1,i>>>(nullptr, 0); + hipDeviceSynchronize(); +} +``` + +Analyzing our first `conflicts` kernel (i.e., a single work-item), we see: + +```shell-session +$ omniperf analyze -p workloads/lds/mi200/ -b 12.2.4 12.2.6 --dispatch 256 -n per_kernel +<...> +-------------------------------------------------------------------------------- +12. Local Data Share (LDS) +12.2 LDS Stats +╒═════════╤════════════════╤═══════╤═══════╤═══════╤═══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════════╪═══════╪═══════╪═══════╪═══════════════════╡ +│ 12.2.4 │ Index Accesses │ 2.00 │ 2.00 │ 2.00 │ Cycles per kernel │ +├─────────┼────────────────┼───────┼───────┼───────┼───────────────────┤ +│ 12.2.6 │ Bank Conflict │ 0.00 │ 0.00 │ 0.00 │ Cycles per kernel │ +╘═════════╧════════════════╧═══════╧═══════╧═══════╧═══════════════════╛ +``` + +In our [previous example](lds_bank_conflicts), we showed how a load from a single work-item is considered to have a theoretical bandwidth of 256B. +Recall, the [LDS](lds) can load up to $128B$ per cycle (i.e, 32 banks x 4B / bank / cycle). +Hence, we see that loading an 4B integer spends two cycles accessing the LDS ($2\ {cycle} = (256B) / (128\ B/{cycle})$). + +Looking at the next `conflicts` dispatch (i.e., two work-items) yields: + +```shell-session +$ omniperf analyze -p workloads/lds/mi200/ -b 12.2.4 12.2.6 --dispatch 257 -n per_kernel +<...> +-------------------------------------------------------------------------------- +12. Local Data Share (LDS) +12.2 LDS Stats +╒═════════╤════════════════╤═══════╤═══════╤═══════╤═══════════════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════════╪═══════╪═══════╪═══════╪═══════════════════╡ +│ 12.2.4 │ Index Accesses │ 3.00 │ 3.00 │ 3.00 │ Cycles per kernel │ +├─────────┼────────────────┼───────┼───────┼───────┼───────────────────┤ +│ 12.2.6 │ Bank Conflict │ 1.00 │ 1.00 │ 1.00 │ Cycles per kernel │ +╘═════════╧════════════════╧═══════╧═══════╧═══════╧═══════════════════╛ +``` + +Here we see a bank conflict! What happened? + +Recall that the index for each thread was calculated as: + +```c++ +int index = threadIdx.x * nbanks; +``` + +Or, precisely 32 elements, and each element is 4B wide (for a standard integer). 
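
Assuming the usual LDS addressing scheme in which consecutive 4-byte words map to consecutive banks (consistent with the 32 banks x 4B/bank figure quoted above), the bank touched by work-item $i$ is:

```math
bank(i) = (index_i) \bmod nbanks = (i \cdot 32) \bmod 32 = 0
```

so every active work-item reads from bank 0.
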
+That is, each thread strides back to the same bank in the LDS, such that each work-item we add to the dispatch results in another bank conflict! + +Recalling our discussion of bank conflicts in our [LDS](lds) description: + +>A bank conflict occurs when two (or more) work-items in a wavefront want to read, write, or atomically update different addresses that map to the same bank in the same cycle. +In this case, the conflict detection hardware will determined a new schedule such that the **access is split into multiple cycles with no conflicts in any single cycle.** + +Here we see the conflict resolution hardware in action! Because we have engineered our kernel to generate conflicts, we expect our bank conflict metric to scale linearly with the number of work-items: + +```{figure} images/ldsconflicts.* +:scale: 50 % +:alt: Comparison of LDS conflict cycles versus access cycles for our simple example. +:align: center + +Comparison of LDS conflict cycles versus access cycles for our simple example. +``` + +Here we show the comparison of the Index Accesses (12.2.4), to the Bank Conflicts (12.2.6) for the first 20 kernel invocations. +We see that each grows linearly, and there is a constant gap of 2 cycles between them (i.e., the first access is never considered a conflict). + + +Finally, we can use these two metrics to derive the Bank Conflict Rate (12.1.4). Since within an Index Access we have 32 banks that may need to be updated, we use: + +$$ +Bank\ Conflict\ Rate = 100 * ((Bank\ Conflicts / 32) / (Index\ Accesses - Bank\ Conflicts)) +$$ + +Plotting this, we see: + +```{figure} images/ldsconflictrate.* +:scale: 50 % +:alt: LDS Bank Conflict rate for our simple example. +:align: center + +LDS Bank Conflict rate for our simple example. +``` + +The bank conflict rate linearly increases with the number of work-items within a wavefront that are active, _approaching_ 100\%, but never quite reaching it. + + +(Occupancy_example)= +## Occupancy Limiters Example + + +In this [example](https://github.com/AMDResearch/omniperf/blob/dev/sample/occupancy.hip), we will investigate the use of the resource allocation panel in the [Workgroup Manager](SPI)'s metrics section to determine occupancy limiters. +This code contains several kernels to explore how both various kernel resources impact achieved occupancy, and how this is reported in Omniperf. + +This example was compiled and run on a MI250 accelerator using ROCm v5.6.0, and Omniperf v2.0.0: +```shell-session +$ hipcc -O3 occupancy.hip -o occupancy --save-temps +``` +We have again included the `--save-temps` flag to get the corresponding assembly. + +Finally, we generate our Omniperf profile as: +```shell-session +$ omniperf profile -n occupancy --no-roof -- ./occupancy +``` + +(Occupancy_experiment_design)= +### Design note + +For our occupancy test, we need to create a kernel that is resource heavy, in various ways. 

For this purpose, we use the following (somewhat funny-looking) kernel:

```c++
constexpr int bound = 16;
__launch_bounds__(256)
__global__ void vgprbound(int N, double* ptr) {
  double intermediates[bound];
  for (int i = 0 ; i < bound; ++i) intermediates[i] = N * threadIdx.x;
  double x = ptr[threadIdx.x];
  for (int i = 0; i < 100; ++i) {
    x += sin(pow(__shfl(x, i % warpSize) * intermediates[(i - 1) % bound], intermediates[i % bound]));
    intermediates[i % bound] = x;
  }
  if (x == N) ptr[threadIdx.x] = x;
}
```

Here we try to use as many [VGPRs](valu) as possible; to this end, we:
 - Create a small array of double-precision floats, sized to try to fit into registers (i.e., `bound`; this may need to be tuned depending on the ROCm version).
 - Specify `__launch_bounds__(256)` to increase the number of VGPRs available to the kernel (by limiting the number of wavefronts that can be resident on a [CU](CU)).
 - Write a unique, non-compile-time constant to each element of the array.
 - Repeatedly permute and call relatively expensive math functions on our array elements.
 - Keep the compiler from optimizing out any operations by faking a write to `ptr` based on a run-time conditional.

This yields a total of 122 VGPRs, although this number is expected to depend on the exact ROCm/compiler version.

```asm
  .size _Z9vgprboundiPd, .Lfunc_end1-_Z9vgprboundiPd
  ; -- End function
  .section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 4732
; NumSgprs: 68
; NumVgprs: 122
; NumAgprs: 0
; <...>
; AccumOffset: 124
```

We will use various permutations of this kernel to limit occupancy and, more importantly for the purposes of this example, demonstrate how this is reported in Omniperf.

(VGPR_occupancy)=
### VGPR Limited

For our first test, we use the `vgprbound` kernel discussed in the [design note](Occupancy_experiment_design).
After profiling, we run the analyze step on this kernel:

```shell-session
$ omniperf analyze -p workloads/occupancy/mi200/ -b 2.1.15 6.2 7.1.5 7.1.6 7.1.7 --dispatch 1
<...>
--------------------------------------------------------------------------------
0. Top Stat
╒════╤═════════════════════════╤═════════╤══════════════╤══════════════╤══════════════╤════════╕
│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │
╞════╪═════════════════════════╪═════════╪══════════════╪══════════════╪══════════════╪════════╡
│ 0 │ vgprbound(int, double*) │ 1.00 │ 923093822.50 │ 923093822.50 │ 923093822.50 │ 100.00 │
╘════╧═════════════════════════╧═════════╧══════════════╧══════════════╧══════════════╧════════╛


--------------------------------------------------------------------------------
2. System Speed-of-Light
2.1 Speed-of-Light
╒═════════╤═════════════════════╤═════════╤════════════╤═════════╤═══════════════╕
│ Index │ Metric │ Avg │ Unit │ Peak │ Pct of Peak │
╞═════════╪═════════════════════╪═════════╪════════════╪═════════╪═══════════════╡
│ 2.1.15 │ Wavefront Occupancy │ 1661.24 │ Wavefronts │ 3328.00 │ 49.92 │
╘═════════╧═════════════════════╧═════════╧════════════╧═════════╧═══════════════╛


--------------------------------------------------------------------------------
6. 
Workgroup Manager (SPI) +6.2 Workgroup Manager - Resource Allocation +╒═════════╤════════════════════════════════════════╤═══════╤═══════╤═══════╤════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪════════════════════════════════════════╪═══════╪═══════╪═══════╪════════╡ +│ 6.2.0 │ Not-scheduled Rate (Workgroup Manager) │ 0.64 │ 0.64 │ 0.64 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.1 │ Not-scheduled Rate (Scheduler-Pipe) │ 24.94 │ 24.94 │ 24.94 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.2 │ Scheduler-Pipe Stall Rate │ 24.49 │ 24.49 │ 24.49 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.3 │ Scratch Stall Rate │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.4 │ Insufficient SIMD Waveslots │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.5 │ Insufficient SIMD VGPRs │ 94.90 │ 94.90 │ 94.90 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.6 │ Insufficient SIMD SGPRs │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.7 │ Insufficient CU LDS │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.8 │ Insufficient CU Barriers │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.9 │ Reached CU Workgroup Limit │ 0.00 │ 0.00 │ 0.00 │ Pct │ +├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤ +│ 6.2.10 │ Reached CU Wavefront Limit │ 0.00 │ 0.00 │ 0.00 │ Pct │ +╘═════════╧════════════════════════════════════════╧═══════╧═══════╧═══════╧════════╛ + + +-------------------------------------------------------------------------------- +7. Wavefront +7.1 Wavefront Launch Stats +╒═════════╤══════════╤════════╤════════╤════════╤═══════════╕ +│ Index │ Metric │ Avg │ Min │ Max │ Unit │ +╞═════════╪══════════╪════════╪════════╪════════╪═══════════╡ +│ 7.1.5 │ VGPRs │ 124.00 │ 124.00 │ 124.00 │ Registers │ +├─────────┼──────────┼────────┼────────┼────────┼───────────┤ +│ 7.1.6 │ AGPRs │ 4.00 │ 4.00 │ 4.00 │ Registers │ +├─────────┼──────────┼────────┼────────┼────────┼───────────┤ +│ 7.1.7 │ SGPRs │ 80.00 │ 80.00 │ 80.00 │ Registers │ +╘═════════╧══════════╧════════╧════════╧════════╧═══════════╛ +``` + +Here we see that the kernel indeed does use _around_ (but not exactly) 122 VGPRs, with the difference due to granularity of VGPR allocations. +In addition, we see that we have allocated 4 "[AGPRs](agprs)". +We note that on current CDNA2 accelerators, the `AccumOffset` field of the assembly metadata: +```asm +; AccumOffset: 124 +``` +denotes the divide between `VGPRs` and `AGPRs`. + + +Next, we examine our wavefront occupancy (2.1.15), and see that we are reaching only $\sim50\%$ of peak occupancy. +As a result, we see that: + - We are not scheduling workgroups $\sim25\%$ of [total scheduler-pipe cycles](TotalPipeCycles) (6.2.1); recall from the discussion of the [Workgroup manager](SPI), 25\% is the maximum. + - The scheduler-pipe is stalled (6.2.2) from scheduling workgroups due to resource constraints for the same $\sim25\%$ of the time. 
+ - And finally, $\sim91\%$ of those stalls are due to a lack of SIMDs with the appropriate number of VGPRs available (6.2.5). + +That is, the reason we can't reach full occupancy is due to our VGPR usage, as expected! + +### LDS Limited + +To examine an LDS limited example, we must change our kernel slightly: + +```c++ +constexpr size_t fully_allocate_lds = 64ul * 1024ul / sizeof(double); +__launch_bounds__(256) +__global__ void ldsbound(int N, double* ptr) { + __shared__ double intermediates[fully_allocate_lds]; + for (int i = threadIdx.x ; i < fully_allocate_lds; i += blockDim.x) intermediates[i] = N * threadIdx.x; + __syncthreads(); + double x = ptr[threadIdx.x]; + for (int i = threadIdx.x; i < fully_allocate_lds; i += blockDim.x) { + x += sin(pow(__shfl(x, i % warpSize) * intermediates[(i - 1) % fully_allocate_lds], intermediates[i % fully_allocate_lds])); + __syncthreads(); + intermediates[i % fully_allocate_lds] = x; + } + if (x == N) ptr[threadIdx.x] = x; +} +``` + +where we now: + - allocate an 64 KiB LDS array per workgroup, and + - use our allocated LDS array instead of a register array + +Analyzing this: + +```shell-session +$ omniperf analyze -p workloads/occupancy/mi200/ -b 2.1.15 6.2 7.1.5 7.1.6 7.1.7 7.1.8 --dispatch 3 +<...> +-------------------------------------------------------------------------------- +2. System Speed-of-Light +2.1 Speed-of-Light +╒═════════╤═════════════════════╤════════╤════════════╤═════════╤═══════════════╕ +│ Index │ Metric │ Avg │ Unit │ Peak │ Pct of Peak │ +╞═════════╪═════════════════════╪════════╪════════════╪═════════╪═══════════════╡ +│ 2.1.15 │ Wavefront Occupancy │ 415.52 │ Wavefronts │ 3328.00 │ 12.49 │ +╘═════════╧═════════════════════╧════════╧════════════╧═════════╧═══════════════╛ + + +-------------------------------------------------------------------------------- +6. 
Workgroup Manager (SPI)
6.2 Workgroup Manager - Resource Allocation
╒═════════╤════════════════════════════════════════╤═══════╤═══════╤═══════╤════════╕
│ Index │ Metric │ Avg │ Min │ Max │ Unit │
╞═════════╪════════════════════════════════════════╪═══════╪═══════╪═══════╪════════╡
│ 6.2.0 │ Not-scheduled Rate (Workgroup Manager) │ 0.13 │ 0.13 │ 0.13 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.1 │ Not-scheduled Rate (Scheduler-Pipe) │ 24.87 │ 24.87 │ 24.87 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.2 │ Scheduler-Pipe Stall Rate │ 24.84 │ 24.84 │ 24.84 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.3 │ Scratch Stall Rate │ 0.00 │ 0.00 │ 0.00 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.4 │ Insufficient SIMD Waveslots │ 0.00 │ 0.00 │ 0.00 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.5 │ Insufficient SIMD VGPRs │ 0.00 │ 0.00 │ 0.00 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.6 │ Insufficient SIMD SGPRs │ 0.00 │ 0.00 │ 0.00 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.7 │ Insufficient CU LDS │ 96.47 │ 96.47 │ 96.47 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.8 │ Insufficient CU Barriers │ 0.00 │ 0.00 │ 0.00 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.9 │ Reached CU Workgroup Limit │ 0.00 │ 0.00 │ 0.00 │ Pct │
├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
│ 6.2.10 │ Reached CU Wavefront Limit │ 0.00 │ 0.00 │ 0.00 │ Pct │
╘═════════╧════════════════════════════════════════╧═══════╧═══════╧═══════╧════════╛


--------------------------------------------------------------------------------
7. Wavefront
7.1 Wavefront Launch Stats
╒═════════╤════════════════╤══════════╤══════════╤══════════╤═══════════╕
│ Index │ Metric │ Avg │ Min │ Max │ Unit │
╞═════════╪════════════════╪══════════╪══════════╪══════════╪═══════════╡
│ 7.1.5 │ VGPRs │ 96.00 │ 96.00 │ 96.00 │ Registers │
├─────────┼────────────────┼──────────┼──────────┼──────────┼───────────┤
│ 7.1.6 │ AGPRs │ 0.00 │ 0.00 │ 0.00 │ Registers │
├─────────┼────────────────┼──────────┼──────────┼──────────┼───────────┤
│ 7.1.7 │ SGPRs │ 80.00 │ 80.00 │ 80.00 │ Registers │
├─────────┼────────────────┼──────────┼──────────┼──────────┼───────────┤
│ 7.1.8 │ LDS Allocation │ 65536.00 │ 65536.00 │ 65536.00 │ Bytes │
╘═════════╧════════════════╧══════════╧══════════╧══════════╧═══════════╛
```

We see that our VGPR allocation has gone down to 96 registers, but we now see a 64 KiB LDS allocation (7.1.8).
In addition, we see a similar not-scheduled rate (6.2.1) and stall rate (6.2.2) as in our [VGPR example](VGPR_occupancy); however, our occupancy limiter has now shifted from VGPRs (6.2.5) to LDS (6.2.7).

We note that although we see around the same scheduler/stall rates (with our LDS limiter), our wavefront occupancy (2.1.15) is significantly lower ($\sim12\%$)!
This is important to remember: the occupancy limiter metrics in the resource allocation section tell you what the limiter was, but _not_ how much the occupancy was limited. 
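
In fact, the measured number can be estimated by hand. Assuming 64 KiB of LDS per CU and a maximum of 32 wavefronts per CU on the [MI2XX](2xxnote) (3328 peak wavefronts over 104 CUs), a single workgroup of 256 work-items (4 wavefronts) that allocates the full 64 KiB is the only workgroup that can be resident on a CU, so:

```math
4\ wavefronts / 32\ wavefronts = 12.5\%
```

in good agreement with the measured wavefront occupancy of 12.49% (2.1.15).
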
+These metrics should always be analyzed in concert with the wavefront occupancy metric! + +### SGPR Limited + +Finally, we modify our kernel once more to make it limited by [SGPRs](salu): + +```c++ +constexpr int sgprlim = 1; +__launch_bounds__(1024, 8) +__global__ void sgprbound(int N, double* ptr) { + double intermediates[sgprlim]; + for (int i = 0 ; i < sgprlim; ++i) intermediates[i] = i; + double x = ptr[0]; + #pragma unroll 1 + for (int i = 0; i < 100; ++i) { + x += sin(pow(intermediates[(i - 1) % sgprlim], intermediates[i % sgprlim])); + intermediates[i % sgprlim] = x; + } + if (x == N) ptr[0] = x; +} +``` + +The major changes here are to: + - make as much as possible provably uniform across the wave (notice the lack of `threadIdx.x` in the `intermediates` initialization and elsewhere), + - addition of `__launch_bounds__(1024, 8)`, which reduces our maximum VGPRs to 64 (such that 8 waves can fit per SIMD), but causes some register spills (i.e., [Scratch](Mspace) usage), and + - lower the `bound` (here we use `sgprlim`) of the array to reduce VGPR/Scratch usage + +This results in the following assembly metadata for this kernel: +```asm + .size _Z9sgprboundiPd, .Lfunc_end3-_Z9sgprboundiPd + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 4872 +; NumSgprs: 76 +; NumVgprs: 64 +; NumAgprs: 0 +; TotalNumVgprs: 64 +; ScratchSize: 60 +; <...> +; AccumOffset: 64 +; Occupancy: 8 +``` + +Analyzing this workload yields: + +```shell-session +$ omniperf analyze -p workloads/occupancy/mi200/ -b 2.1.15 6.2 7.1.5 7.1.6 7.1.7 7.1.8 7.1.9 --dispatch 5 +<...> +-------------------------------------------------------------------------------- +0. Top Stat +╒════╤═════════════════════════╤═════════╤══════════════╤══════════════╤══════════════╤════════╕ +│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ +╞════╪═════════════════════════╪═════════╪══════════════╪══════════════╪══════════════╪════════╡ +│ 0 │ sgprbound(int, double*) │ 1.00 │ 782069812.00 │ 782069812.00 │ 782069812.00 │ 100.00 │ +╘════╧═════════════════════════╧═════════╧══════════════╧══════════════╧══════════════╧════════╛ + + +-------------------------------------------------------------------------------- +2. System Speed-of-Light +2.1 Speed-of-Light +╒═════════╤═════════════════════╤═════════╤════════════╤═════════╤═══════════════╕ +│ Index │ Metric │ Avg │ Unit │ Peak │ Pct of Peak │ +╞═════════╪═════════════════════╪═════════╪════════════╪═════════╪═══════════════╡ +│ 2.1.15 │ Wavefront Occupancy │ 3291.76 │ Wavefronts │ 3328.00 │ 98.91 │ +╘═════════╧═════════════════════╧═════════╧════════════╧═════════╧═══════════════╛ + + +-------------------------------------------------------------------------------- +6. 
+6.2 Workgroup Manager - Resource Allocation
+╒═════════╤════════════════════════════════════════╤═══════╤═══════╤═══════╤════════╕
+│ Index   │ Metric                                 │   Avg │   Min │   Max │ Unit   │
+╞═════════╪════════════════════════════════════════╪═══════╪═══════╪═══════╪════════╡
+│ 6.2.0   │ Not-scheduled Rate (Workgroup Manager) │  7.72 │  7.72 │  7.72 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.1   │ Not-scheduled Rate (Scheduler-Pipe)    │ 15.17 │ 15.17 │ 15.17 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.2   │ Scheduler-Pipe Stall Rate              │  7.38 │  7.38 │  7.38 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.3   │ Scratch Stall Rate                     │ 39.76 │ 39.76 │ 39.76 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.4   │ Insufficient SIMD Waveslots            │ 26.32 │ 26.32 │ 26.32 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.5   │ Insufficient SIMD VGPRs                │ 26.32 │ 26.32 │ 26.32 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.6   │ Insufficient SIMD SGPRs                │ 25.52 │ 25.52 │ 25.52 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.7   │ Insufficient CU LDS                    │  0.00 │  0.00 │  0.00 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.8   │ Insufficient CU Barriers               │  0.00 │  0.00 │  0.00 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.9   │ Reached CU Workgroup Limit             │  0.00 │  0.00 │  0.00 │ Pct    │
+├─────────┼────────────────────────────────────────┼───────┼───────┼───────┼────────┤
+│ 6.2.10  │ Reached CU Wavefront Limit             │  0.00 │  0.00 │  0.00 │ Pct    │
+╘═════════╧════════════════════════════════════════╧═══════╧═══════╧═══════╧════════╛
+
+
+--------------------------------------------------------------------------------
+7. Wavefront
+7.1 Wavefront Launch Stats
+╒═════════╤════════════════════╤═══════╤═══════╤═══════╤════════════════╕
+│ Index   │ Metric             │   Avg │   Min │   Max │ Unit           │
+╞═════════╪════════════════════╪═══════╪═══════╪═══════╪════════════════╡
+│ 7.1.5   │ VGPRs              │ 64.00 │ 64.00 │ 64.00 │ Registers      │
+├─────────┼────────────────────┼───────┼───────┼───────┼────────────────┤
+│ 7.1.6   │ AGPRs              │  0.00 │  0.00 │  0.00 │ Registers      │
+├─────────┼────────────────────┼───────┼───────┼───────┼────────────────┤
+│ 7.1.7   │ SGPRs              │ 80.00 │ 80.00 │ 80.00 │ Registers      │
+├─────────┼────────────────────┼───────┼───────┼───────┼────────────────┤
+│ 7.1.8   │ LDS Allocation     │  0.00 │  0.00 │  0.00 │ Bytes          │
+├─────────┼────────────────────┼───────┼───────┼───────┼────────────────┤
+│ 7.1.9   │ Scratch Allocation │ 60.00 │ 60.00 │ 60.00 │ Bytes/workitem │
+╘═════════╧════════════════════╧═══════╧═══════╧═══════╧════════════════╛
+```
+
+Here we see that our wavefront launch stats (7.1) have changed to reflect the metadata seen in the `--save-temps` output.
+Of particular interest, we see:
+ - The SGPR allocation (7.1.7) is 80 registers, slightly more than the 76 requested by the compiler due to allocation granularity, and
+ - We have a ['scratch'](Mspace), i.e., private memory, allocation of 60 bytes per work-item.
+
+Analyzing the resource allocation block (6.2), we now see that, for the first time, the 'Not-scheduled Rate (Workgroup Manager)' metric (6.2.0) has become non-zero.
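+
+Before looking at why, it can be useful to confirm where this scratch comes from directly in the compiler's resource summary shown above. A minimal sketch, assuming the kernel lives in a file named `occupancy.hip` and is built for an MI200 (gfx90a) accelerator; the name of the assembly file produced by `--save-temps` varies with the compiler version:
+
+```shell-session
+$ hipcc --offload-arch=gfx90a --save-temps -c occupancy.hip
+$ grep -E 'NumSgprs|ScratchSize|Occupancy' occupancy-hip-amdgcn-amd-amdhsa-gfx90a.s  # illustrative file name
+```
+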
This is because the workgroup manager is responsible for management of scratch, which we see also contributes to our occupancy limiters in the 'Scratch Stall Rate' (6.2.3). We note that the sum of the workgroup manager not-scheduled rate and the scheduler-pipe non-scheduled rate is still $\sim25\%$, as in our previous examples + +Next, we see that the scheduler-pipe stall rate (6.2.2), i.e., how often we could not schedule a workgroup to a CU was only about $\sim8\%$. +This hints that perhaps, our kernel is not _particularly_ occupancy limited by resources, and indeed checking the wave occupancy metric (2.1.15) shows that this kernel is reaching nearly 99% occupancy! + +Finally, we inspect the occupancy limiter metrics and see a roughly even split between [waveslots](valu) (6.2.4), [VGPRs](valu) (6.2.5), and [SGPRs](salu) (6.2.6) along with the scratch stalls (6.2.3) previously mentioned. + +This is yet another reminder to view occupancy holistically. +While these metrics tell you why a workgroup cannot be scheduled, they do _not_ tell you what your occupancy was (consult wavefront occupancy) _nor_ whether increasing occupancy will be beneficial to performance. + diff --git a/src/docs/profiling.md b/src/docs/profiling.md index 56c234604..14d212b2c 100644 --- a/src/docs/profiling.md +++ b/src/docs/profiling.md @@ -37,7 +37,7 @@ Releasing CPU memory ``` ## Omniperf Profiling -The *omniperf* script, availible through the [Omniperf](https://github.com/AMDResearch/omniperf) repository, is used to aquire all necessary perfmon data through analysis of compute workloads. +The *omniperf* script, available through the Omniperf repository, is used to aquire all necessary performance monitoring data through analysis of compute workloads. **omniperf help:** ```shell-session @@ -80,7 +80,7 @@ Profile Options: -p , --path Specify path to save workload. (DEFAULT: /home/colramos/GitHub/omniperf/workloads/) -k [ ...], --kernel [ ...] Kernel filtering. - -b [ ...], --ipblocks [ ...] IP block filtering: + -b [ ...], --ipblocks [ ...] Hardware block filtering: SQ SQC TA @@ -108,6 +108,13 @@ Standalone Roofline Options: --kernel-names Include kernel names in roofline plot. ``` +- The `-k` \ flag allows for kernel filtering, which is compatible with the current rocProf utility. + +- The `-d` \ flag allows for dispatch ID filtering, which is compatible with the current rocProf utility. + +- The `-b` \ allows system profiling on one or more selected hardware components to speed up the profiling process. One can gradually include more hardware components, without overwriting performance data acquired on other hardware components. + + The following sample command profiles the *vcopy* workload. **vcopy profiling:** @@ -116,7 +123,6 @@ $ omniperf profile --name vcopy -- ./vcopy 1048576 256 Resolving rocprof ROC Profiler: /usr/bin/rocprof - ------------- Profile only ------------- @@ -152,7 +158,7 @@ Finished executing kernel Finished copying the output vector from the GPU to the CPU Releasing GPU memory Releasing CPU memory - + ... ... 
ROCPRofiler: 1 contexts collected, output directory /tmp/rpl_data_220527_130317_1787038/input_results_220527_130317 File 'workloads/vcopy/mi200/timestamps.csv' is generating @@ -204,16 +210,16 @@ Peak MFMA FLOPs (F64), GPU ID: 1, workgroupSize:256, workgroups:16384, experimen 99% [||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ] Peak MFMA IOPs (I8), GPU ID: 1, workgroupSize:256, workgroups:16384, experiments:100, IOP:2147483648000, duration:14.3 ms, mean:150317.8 GOPS, stdev=203.5 GOPS ``` -You'll notice two stages in *default* Omniperf profiling. The first stage collects all the counters needed for Omniperf analysis (omitting any filters you've provided). The second stage collects data for the roofline analysis (this stage can be disabled using `--no-roof`) +You will notice two stages in *default* Omniperf profiling. The first stage collects all the counters needed for Omniperf analysis (omitting any filters you have provided). The second stage collects data for the roofline analysis (this stage can be disabled using `--no-roof`) -At the end of the profiling, all resulting csv files should be located in a SOC specific target directory, e.g.: - - "mi200" for the AMD Instinct (tm) MI-200 family of accelerators - - "mi100" for the AMD Instinct (tm) MI-100 family of accelerators -etc. The SOC names are generated as a part of Omniperf, and do not necessarily distinguish between different accelerators in the same family (e.g., an AMD Instinct (tm) MI-210 vs an MI-250) +In this document, we use the term System on Chip (SoC) to refer to a particular family of accelerators. At the end of profiling, all resulting csv files should be located in a SoC specific target directory, e.g.: + - "mi200" for the AMD Instinct (tm) MI200 family of accelerators + - "mi100" for the AMD Instinct (tm) MI100 family of accelerators +etc. The SoC names are generated as a part of Omniperf, and do not necessarily distinguish between different accelerators in the same family (e.g., an AMD Instinct (tm) MI210 vs an MI250) -> Note: Additionally, you'll notice a few extra files. An SoC parameters file, *sysinfo.csv*, is created to reflect the target device settings. All profiling output is stored in *log.txt*. Roofline specific benchmark results are stored in *roofline.csv*. +> Note: Additionally, you will notice a few extra files. An SoC parameters file, *sysinfo.csv*, is created to reflect the target device settings. All profiling output is stored in *log.txt*. Roofline specific benchmark results are stored in *roofline.csv*. -```shell +```shell-session $ ls workloads/vcopy/mi200/ total 112 drwxrwxr-x 3 colramos colramos 4096 Apr 11 16:42 . @@ -232,17 +238,17 @@ drwxrwxr-x 2 colramos colramos 4096 Apr 11 16:42 perfmon ``` ### Filtering -To reduce profiling time and the counters collected one may use profiling filters. Profiling filters and their functionality depend on the underlying profiler being used. While Omniperf is profiler agnostic, we've provided a detailed description of profiling filters available when using Omniperf with [rocProfiler](https://rocm.docs.amd.com/projects/rocprofiler/en/latest/rocprof.html) below. +To reduce profiling time and the counters collected one may use profiling filters. Profiling filters and their functionality depend on the underlying profiler being used. While Omniperf is profiler agnostic, we have provided a detailed description of profiling filters available when using Omniperf with [rocProf](https://rocm.docs.amd.com/projects/rocprofiler/en/latest/rocprof.html) below. 
Filtering Options: -- The `-k` \ flag allows for kernel filtering. Useage is equivalent with the current rocprof utility ([see details below](#kernel-filtering)). +- The `-k` \ flag allows for kernel filtering. Useage is equivalent with the current rocProf utility ([see details below](#kernel-filtering)). -- The `-d` \ flag allows for dispatch ID filtering. Useage is equivalent with the current rocprof utility ([see details below](#dispatch-filtering)). +- The `-d` \ flag allows for dispatch ID filtering. Useage is equivalent with the current rocProf utility ([see details below](#dispatch-filtering)). -- The `-b` \ allows system profiling on one or more selected IP blocks to speed up the profiling process. One can gradually incorporate more IP blocks, without overwriting performance data acquired on other IP blocks. +- The `-b` \ allows system profiling on one or more selected hardware components to speed up the profiling process. One can gradually include more hardware components, without overwriting performance data acquired on other hardware components. ```{note} Be cautious while combining different profiling filters in the same call. Conflicting filters may result in error. @@ -250,11 +256,11 @@ Be cautious while combining different profiling filters in the same call. Confli i.e. filtering dispatch X, but dispatch X does not match your kernel name filter ``` -#### IP Block Filtering -One can profile a selected IP Block to speed up the profiling process. All profiling results are accumulated in the same target directory, without overwriting those for other IP blocks, hence enabling the incremental profiling and analysis. +#### Hardware Component Filtering +One can profile specific hardware components to speed up the profiling process. In Omniperf, we use the term IP block to refer to a hardware component or a group of hardware components. All profiling results are accumulated in the same target directory, without overwriting those for other hardware components, hence enabling the incremental profiling and analysis. -The following example only gathers hardware counters for SQ and TCC, skipping all other IP Blocks: -```shell +The following example only gathers hardware counters for the Shader Sequencer (SQ) and L2 Cache (TCC) components, skipping all other hardware components: +```shell-session $ omniperf profile --name vcopy -b SQ TCC -- ./sample/vcopy 1048576 256 Resolving rocprof ROC Profiler: /usr/bin/rocprof @@ -291,15 +297,14 @@ Log: /home/colramos/GitHub/omniperf-pub/workloads/vcopy/mi200/log.txt ``` #### Kernel Filtering -Kernel filtering is based on the name of the kernel(s) you'd like to isolate. Use a kernel name substring list to isolate desired kernels. +Kernel filtering is based on the name of the kernel(s) you would like to isolate. Use a kernel name substring list to isolate desired kernels. The following example demonstrates profiling isolating the kernel matching substring "vecCopy": -```shell +```shell-session $ omniperf profile --name vcopy -k vecCopy -- ./vcopy 1048576 256 Resolving rocprof ROC Profiler: /usr/bin/rocprof - ------------- Profile only ------------- @@ -323,7 +328,7 @@ Finished allocating vectors on the CPU ROCProfiler: input from "/tmp/rpl_data_230411_170300_29696/input0.xml" gpu_index = kernel = vecCopy - + ... ... 
``` @@ -336,7 +341,6 @@ $ omniperf profile --name vcopy -d 0 -- ./vcopy 1048576 256 Resolving rocprof ROC Profiler: /usr/bin/rocprof - ------------- Profile only ------------- @@ -365,19 +369,18 @@ ROCProfiler: input from "/tmp/rpl_data_230411_170356_30314/input0.xml" ``` - ### Standalone Roofline -If you're only interested in generating roofline analysis data try using `--roof-only`. This will only collect counters relevent to roofline, as well as generate a standalone .pdf output of your roofline plot. +If you are only interested in generating roofline analysis data try using `--roof-only`. This will only collect counters relevant to roofline, as well as generate a standalone .pdf output of your roofline plot. Standalone Roofline Options: -- The `--sort` \ allows you to specify whether you'd like to overlay top kernel or top dispatch data in your roofline plot. +- The `--sort` \ allows you to specify whether you would like to overlay top kernel or top dispatch data in your roofline plot. -- The `-m` \ allows you to specify specific level(s) of cache you'd like to include in your roofline plot. +- The `-m` \ allows you to specify specific level(s) of cache you would like to include in your roofline plot. - The `--device` \ allows you to specify a device id to collect performace data from when running our roofline benchmark on your system. -- If you'd like to distinguish different kernels in your .pdf roofline plot use `--kernel-names`. This will give each kernel a unique marker identifiable from the plot's key. +- If you would like to distinguish different kernels in your .pdf roofline plot use `--kernel-names`. This will give each kernel a unique marker identifiable from the plot's key. #### Roofline Only @@ -422,4 +425,4 @@ drwxrwxr-x 2 colramos colramos 4096 Apr 11 17:16 perfmon ``` A sample *empirRoof_gpu-ALL_fp32.pdf* looks something like this: -![Sample Standalone Roof Plot](images/sample-roof-plot.png) \ No newline at end of file +![Sample Standalone Roof Plot](images/sample-roof-plot.png) diff --git a/src/omniperf_analyze/configs/gfx906/0200_system-speed-of-light.yaml b/src/omniperf_analyze/configs/gfx906/0200_system-speed-of-light.yaml index 986b2f0ae..82e59c997 100644 --- a/src/omniperf_analyze/configs/gfx906/0200_system-speed-of-light.yaml +++ b/src/omniperf_analyze/configs/gfx906/0200_system-speed-of-light.yaml @@ -14,10 +14,10 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit peak: Peak - pop: PoP + pop: Pct of Peak tips: Tips metric: VALU FLOPs: @@ -28,7 +28,7 @@ Panel Config: tips: VALU IOPs: value: None # No perf counter - unit: GOPs + unit: GIOPs peak: (((($sclk * $numCU) * 64) * 2) / 1000) pop: None # No perf counter tips: @@ -68,25 +68,37 @@ Panel Config: peak: $numCU pop: ((100 * $numActiveCUs) / $numCU) tips: - SALU Util: + SALU Utilization: value: AVG(((100 * SQ_ACTIVE_INST_SCA) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct peak: 100 pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / (GRBM_GUI_ACTIVE * $numCU))) tips: - VALU Util: + VALU Utilization: value: AVG(((100 * SQ_ACTIVE_INST_VALU) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct peak: 100 pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / (GRBM_GUI_ACTIVE * $numCU))) tips: - MFMA Util: + MFMA Utilization: value: None # No HW module unit: pct peak: 100 pop: None # No HW module tips: - VALU Active Threads/Wave: + VMEM Utilization: + value: None # No HW module + unit: pct + peak: 100 + pop: None # No HW module + tips: + Branch Utilization: + value: None # No HW module + unit: pct + peak: 100 + 
pop: None # No HW module + tips: + VALU Active Threads: value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) unit: Threads @@ -94,25 +106,29 @@ Panel Config: pop: (AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) * 1.5625) tips: - IPC - Issue: - value: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) - / SQ_ACTIVE_INST_ANY)) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) unit: Instr/cycle peak: 5 - pop: ((100 * AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) - / SQ_ACTIVE_INST_ANY))) / 5) + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) + tips: + Wavefront Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE)) + unit: Wavefronts + peak: ($maxWavesPerCU * $numCU) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE) / ($maxWavesPerCU + * $numCU)))) + coll_level: SQ_LEVEL_WAVES tips: - LDS BW: + Theoretical LDS Bandwidth: value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs))) - unit: GB/sec + unit: GB/s peak: (($sclk * $numCU) * 0.128) pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs)) / (($sclk * $numCU) * 0.00128))) tips: - LDS Bank Conflict: + LDS Bank Conflicts/Access: value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) unit: Conflicts/access @@ -120,35 +136,7 @@ Panel Config: pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) tips: - Instr Cache Hit Rate: - value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - tips: - Instr Cache BW: - value: AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64)) - unit: GB/s - peak: ((($sclk / 1000) * 64) * $numSQC) - pop: ((100 * AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk - / 1000) * 64) * $numSQC)) - tips: - Scalar L1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - tips: - Scalar L1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64)) - unit: GB/s - peak: ((($sclk / 1000) * 64) * $numSQC) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk - / 1000) * 64) * $numSQC)) - tips: - Vector L1D Cache Hit Rate: + vL1D Cache Hit Rate: value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else @@ -160,7 +148,7 @@ Panel Config: TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else None)) tips: - Vector L1D Cache BW: + vL1D Cache BW: value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) unit: GB/s peak: ((($sclk / 1000) * 64) * $numCU) @@ -175,6 +163,13 @@ Panel Config: pop: AVG((((100 * 
TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else None)) tips: + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 64) / (EndNs - BeginNs))) + unit: GB/s + peak: ((($sclk / 1000) * 64) * TO_INT($L2Banks)) + pop: ((100 * AVG(((TCC_REQ_sum * 64) / (EndNs - BeginNs)))) + / ((($sclk / 1000) * 64) * TO_INT($L2Banks))) + tips: L2-Fabric Read BW: value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / (EndNs - BeginNs))) @@ -195,36 +190,48 @@ Panel Config: value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None tips: L2-Fabric Write Latency: value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None tips: - Wave Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE)) - unit: Wavefronts - peak: ($maxWavesPerCU * $numCU) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE) / ($maxWavesPerCU - * $numCU)))) - coll_level: SQ_LEVEL_WAVES + sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) tips: - Instr Fetch BW: - value: AVG(((SQ_IFETCH / (EndNs - BeginNs)) * 32)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64)) unit: GB/s - peak: ((($sclk / 1000) * 32) * $numSQC) - pop: ((100 * AVG(((SQ_IFETCH / (EndNs - BeginNs)) * 32))) / ($numSQC - * (($sclk / 1000) * 32))) - coll_level: SQ_IFETCH_LEVEL + peak: ((($sclk / 1000) * 64) * $numSQC) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk + / 1000) * 64) * $numSQC)) + tips: + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + tips: + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 32)) + unit: GB/s + peak: ((($sclk / 1000) * 64) * $numSQC) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 32))) / ((($sclk + / 1000) * 32) * $numSQC)) tips: - Instr Fetch Latency: + L1I Fetch Latency: value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None coll_level: SQ_IFETCH_LEVEL tips: diff --git a/src/omniperf_analyze/configs/gfx906/0500_command-processor.yaml b/src/omniperf_analyze/configs/gfx906/0500_command-processor.yaml index 525091879..edd42da6e 100644 --- a/src/omniperf_analyze/configs/gfx906/0500_command-processor.yaml +++ b/src/omniperf_analyze/configs/gfx906/0500_command-processor.yaml @@ -19,19 +19,7 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy Cycles: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles/Kernel - tips: - CPF Busy: - avg: AVG(CPF_CPF_STAT_BUSY) - min: MIN(CPF_CPF_STAT_BUSY) - max: MAX(CPF_CPF_STAT_BUSY) - unit: Cycles/Kernel - tips: - CPF Util: + CPF Utilization: avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) @@ -47,15 +35,9 @@ Panel Config: != 0) else None)) max: MAX((((100 * CPF_CPF_STAT_STALL) / 
CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY != 0) else None)) - unit: Cycles/Kernel - tips: - L2Cache Intf Busy: - avg: AVG(CPF_CPF_TCIU_BUSY) - min: MIN(CPF_CPF_TCIU_BUSY) - max: MAX(CPF_CPF_TCIU_BUSY) - unit: Cycles/Kernel + unit: pct tips: - L2Cache Intf Util: + CPF-L2 Utilization: avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) @@ -64,7 +46,7 @@ Panel Config: if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) unit: pct tips: - L2Cache Intf Stall: + CPF-L2 Stall: avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY != 0) else None)) min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY @@ -73,11 +55,14 @@ Panel Config: != 0) else None)) unit: pct tips: - UTCL1 Stall: - avg: AVG(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - min: MIN(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - max: MAX(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - unit: Cycles/Kernel + CPF-UTCL1 Stall: + avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + unit: pct tips: - metric_table: @@ -91,19 +76,7 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy Cycles: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles - tips: - CPC Busy Cycles: - avg: AVG(CPC_CPC_STAT_BUSY) - min: MIN(CPC_CPC_STAT_BUSY) - max: MAX(CPC_CPC_STAT_BUSY) - unit: Cycles - tips: - CPC Util: + CPC Utilization: avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) @@ -112,12 +85,6 @@ Panel Config: if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) unit: pct tips: - CPC Stall Cycles: - avg: AVG(CPC_CPC_STAT_STALL) - min: MIN(CPC_CPC_STAT_STALL) - max: MAX(CPC_CPC_STAT_STALL) - unit: Cycles - tips: CPC Stall Rate: avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY != 0) else None)) @@ -127,28 +94,19 @@ Panel Config: != 0) else None)) unit: pct tips: - CPC Packet Decoding: - avg: AVG(CPC_ME1_BUSY_FOR_PACKET_DECODE) - min: MIN(CPC_ME1_BUSY_FOR_PACKET_DECODE) - max: MAX(CPC_ME1_BUSY_FOR_PACKET_DECODE) - unit: Cycles - tips: - SPI Intf Busy Cycles: - avg: AVG(CPC_ME1_DC0_SPI_BUSY) - min: MIN(CPC_ME1_DC0_SPI_BUSY) - max: MAX(CPC_ME1_DC0_SPI_BUSY) - unit: Cycles - tips: - SPI Intf Util: - avg: AVG((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) unit: pct tips: - L2Cache Intf Util: + 
CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + unit: Pct + tips: + CPC-L2 Utilization: avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) @@ -157,19 +115,16 @@ Panel Config: if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) unit: pct tips: - UTCL1 Stall Cycles: - avg: AVG(CPC_UTCL1_STALL_ON_TRANSLATION) - min: MIN(CPC_UTCL1_STALL_ON_TRANSLATION) - max: MAX(CPC_UTCL1_STALL_ON_TRANSLATION) - unit: Cycles - tips: - UTCL2 Intf Busy Cycles: - avg: AVG(CPC_CPC_UTCL2IU_BUSY) - min: MIN(CPC_CPC_UTCL2IU_BUSY) - max: MAX(CPC_CPC_UTCL2IU_BUSY) - unit: Cycles + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: pct tips: - UTCL2 Intf Util: + CPC-UTCL2 Utilization: avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) diff --git a/src/omniperf_analyze/configs/gfx906/0600_shader-processor-input.yaml b/src/omniperf_analyze/configs/gfx906/0600_shader-processor-input.yaml index bab48700a..24d4036ec 100644 --- a/src/omniperf_analyze/configs/gfx906/0600_shader-processor-input.yaml +++ b/src/omniperf_analyze/configs/gfx906/0600_shader-processor-input.yaml @@ -6,11 +6,11 @@ Metric Description: # Define the panel properties and properties of each metric in the panel. 
Panel Config: id: 600 - title: Shader Processor Input (SPI) + title: Workgroup Manager (SPI) data source: - metric_table: id: 601 - title: SPI Stats + title: Workgroup Manager Utilizations header: metric: Metric avg: Avg @@ -19,29 +19,35 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles - tips: - CS Busy: - avg: AVG(SPI_CSN_BUSY) - min: MIN(SPI_CSN_BUSY) - max: MAX(SPI_CSN_BUSY) - unit: Cycles - tips: - SPI Busy: - avg: AVG(GRBM_SPI_BUSY) - min: MIN(GRBM_SPI_BUSY) - max: MAX(GRBM_SPI_BUSY) - unit: Cycles - tips: - SQ Busy: - avg: AVG(SQ_BUSY_CYCLES) - min: MIN(SQ_BUSY_CYCLES) - max: MAX(SQ_BUSY_CYCLES) - unit: Cycles + Accelerator Utilization: + avg: AVG(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + min: MIN(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + max: MAX(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + unit: Pct + tips: + Scheduler-Pipe Utilization: + avg: AVG(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + min: MIN(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + max: MAX(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + unit: Pct + tips: + Workgroup Manager Utilization: + avg: AVG(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + min: MIN(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + max: MAX(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + unit: Pct + tips: + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + min: MIN(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + max: MAX(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + unit: Pct + tips: + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Dispatched Workgroups: avg: AVG(SPI_CSN_NUM_THREADGROUPS) @@ -55,22 +61,27 @@ Panel Config: max: MAX(SPI_CSN_WAVE) unit: Wavefronts tips: - Wave Alloc Failed: - avg: AVG(SPI_RA_REQ_NO_ALLOC) - min: MIN(SPI_RA_REQ_NO_ALLOC) - max: MAX(SPI_RA_REQ_NO_ALLOC) - unit: Cycles - tips: - Wave Alloc Failed - CS: - avg: AVG(SPI_RA_REQ_NO_ALLOC_CSN) - min: MIN(SPI_RA_REQ_NO_ALLOC_CSN) - max: MAX(SPI_RA_REQ_NO_ALLOC_CSN) - unit: Cycles + VGPR Writes: + avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave tips: - + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + tips: - metric_table: id: 602 - title: SPI Resource Allocation + title: Workgroup Manager - Resource Allocation header: metric: Metric avg: Avg @@ -79,96 +90,78 @@ Panel Config: unit: Unit tips: Tips metric: - Wave request Failed (CS): - avg: AVG(SPI_RA_REQ_NO_ALLOC_CSN) - min: MIN(SPI_RA_REQ_NO_ALLOC_CSN) - max: MAX(SPI_RA_REQ_NO_ALLOC_CSN) - unit: Cycles - tips: - CS Stall: - avg: AVG(SPI_RA_RES_STALL_CSN) - min: MIN(SPI_RA_RES_STALL_CSN) - max: MAX(SPI_RA_RES_STALL_CSN) - unit: Cycles - tips: - CS Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * 
SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + unit: Pct + tips: + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + unit: Pct + tips: + Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - unit: pct + unit: Pct tips: - Scratch Stall: - avg: AVG(SPI_RA_TMP_STALL_CSN) - min: MIN(SPI_RA_TMP_STALL_CSN) - max: MAX(SPI_RA_TMP_STALL_CSN) - unit: Cycles + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + unit: Pct tips: Insufficient SIMD Waveslots: - avg: AVG(SPI_RA_WAVE_SIMD_FULL_CSN) - min: MIN(SPI_RA_WAVE_SIMD_FULL_CSN) - max: MAX(SPI_RA_WAVE_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient SIMD VGPRs: - avg: AVG(SPI_RA_VGPR_SIMD_FULL_CSN) - min: MIN(SPI_RA_VGPR_SIMD_FULL_CSN) - max: MAX(SPI_RA_VGPR_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient SIMD SGPRs: - avg: AVG(SPI_RA_SGPR_SIMD_FULL_CSN) - min: MIN(SPI_RA_SGPR_SIMD_FULL_CSN) - max: MAX(SPI_RA_SGPR_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient CU LDS: - avg: AVG(SPI_RA_LDS_CU_FULL_CSN) - min: MIN(SPI_RA_LDS_CU_FULL_CSN) - max: MAX(SPI_RA_LDS_CU_FULL_CSN) - unit: CU - tips: - Insufficient CU Barries: - avg: AVG(SPI_RA_BAR_CU_FULL_CSN) - min: MIN(SPI_RA_BAR_CU_FULL_CSN) - max: MAX(SPI_RA_BAR_CU_FULL_CSN) - unit: CU - tips: - Insufficient Bulky Resource: - avg: AVG(SPI_RA_BULKY_CU_FULL_CSN) - min: MIN(SPI_RA_BULKY_CU_FULL_CSN) - max: MAX(SPI_RA_BULKY_CU_FULL_CSN) - unit: CU - tips: - Reach CU Threadgroups Limit: - avg: AVG(SPI_RA_TGLIM_CU_FULL_CSN) - min: MIN(SPI_RA_TGLIM_CU_FULL_CSN) - max: MAX(SPI_RA_TGLIM_CU_FULL_CSN) - unit: Cycles - tips: - Reach CU Wave Limit: - avg: 
AVG(SPI_RA_WVLIM_STALL_CSN) - min: MIN(SPI_RA_WVLIM_STALL_CSN) - max: MAX(SPI_RA_WVLIM_STALL_CSN) - unit: Cycles + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: - VGPR Writes: - avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: diff --git a/src/omniperf_analyze/configs/gfx906/0700_wavefront-launch.yaml b/src/omniperf_analyze/configs/gfx906/0700_wavefront-launch.yaml index 70141193e..abcaae418 100644 --- a/src/omniperf_analyze/configs/gfx906/0700_wavefront-launch.yaml +++ b/src/omniperf_analyze/configs/gfx906/0700_wavefront-launch.yaml @@ -77,7 +77,7 @@ Panel Config: avg: AVG(scr) min: MIN(scr) max: MAX(scr) - unit: Bytes + unit: Bytes/Workitem tips: - metric_table: @@ -103,7 +103,7 @@ Panel Config: max: MAX(GRBM_GUI_ACTIVE) unit: Cycle tips: - Instr/wavefront: + Instructions per wavefront: avg: AVG((SQ_INSTS / SQ_WAVES)) min: MIN((SQ_INSTS / SQ_WAVES)) max: MAX((SQ_INSTS / SQ_WAVES)) diff --git a/src/omniperf_analyze/configs/gfx906/1000_compute-unit-instruction-mix.yaml b/src/omniperf_analyze/configs/gfx906/1000_compute-unit-instruction-mix.yaml index 679acc34d..0092c202c 100644 --- a/src/omniperf_analyze/configs/gfx906/1000_compute-unit-instruction-mix.yaml +++ b/src/omniperf_analyze/configs/gfx906/1000_compute-unit-instruction-mix.yaml @@ -10,7 +10,7 @@ Panel Config: data source: - metric_table: id: 1001 - title: Instruction Mix + title: Overall Instruction Mix header: metric: Metric avg: Avg @@ -22,7 +22,7 @@ Panel Config: type: simple_bar label_txt: (# of instr + $normUnit) metric: - VALU - Vector: + VALU: avg: None # No HW module min: None # No HW module max: None # No HW module @@ -40,7 +40,7 @@ Panel Config: max: MAX((SQ_INSTS_LDS / $denom)) unit: (instr + $normUnit) tips: - VALU - MFMA: + MFMA: avg: None # No HW module min: None # No HW module max: None # No HW module @@ -64,12 +64,6 @@ Panel Config: max: MAX((SQ_INSTS_BRANCH / $denom)) unit: (instr + $normUnit) tips: - GDS: - avg: AVG((SQ_INSTS_GDS / $denom)) - min: MIN((SQ_INSTS_GDS / $denom)) - max: MAX((SQ_INSTS_GDS / $denom)) - unit: (instr + 
$normUnit) - tips: - metric_table: id: 1002 @@ -103,7 +97,7 @@ Panel Config: max: None # No HW module unit: (instr + $normUnit) tips: - F16-Mult: + F16-MUL: avg: None # No HW module min: None # No HW module max: None # No HW module @@ -127,7 +121,7 @@ Panel Config: max: None # No HW module unit: (instr + $normUnit) tips: - F32-Mult: + F32-MUL: avg: None # No HW module min: None # No HW module max: None # No HW module @@ -151,7 +145,7 @@ Panel Config: max: None # No HW module unit: (instr + $normUnit) tips: - F64-Mult: + F64-MUL: avg: None # No HW module min: None # No HW module max: None # No HW module @@ -180,55 +174,100 @@ Panel Config: id: 1003 title: VMEM Instr Mix header: - type: Type - count: Count + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit tips: Tips metric: - Buffer Instr: - count: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - tips: - Buffer Read: - count: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - tips: - Buffer Write: - count: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - tips: - Buffer Atomic: - count: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - tips: - Flat Instr: - count: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - tips: - Flat Read: - count: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - tips: - Flat Write: - count: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - tips: - Flat Atomic: - count: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - tips: + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Read: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: - metric_table: id: 1004 title: MFMA Arithmetic Instr Mix header: - type: Type - count: Count + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit tips: Tips metric: MFMA-I8: - count: None # No HW module - tips: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) + tips: MFMA-F16: - count: None # No HW module - tips: + avg: None # No HW module + min: None # No HW module + 
max: None # No HW module + unit: (instr + $normUnit) + tips: MFMA-BF16: - count: None # No HW module - tips: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) + tips: MFMA-F32: - count: None # No HW module + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) tips: MFMA-F64: - count: None # No HW module - tips: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx906/1100_compute-unit-compute-pipeline.yaml b/src/omniperf_analyze/configs/gfx906/1100_compute-unit-compute-pipeline.yaml index 4ea952637..63019bfec 100644 --- a/src/omniperf_analyze/configs/gfx906/1100_compute-unit-compute-pipeline.yaml +++ b/src/omniperf_analyze/configs/gfx906/1100_compute-unit-compute-pipeline.yaml @@ -13,7 +13,10 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak tips: Tips style: type: simple_bar @@ -21,23 +24,47 @@ Panel Config: label_txt: (%) xrange: [0, 110] metric: - valu_flops_pop: + VALU FLOPs: + value: None # No perf counter + Unit: None + peak: None + pop: None + tips: + VALU IOPs: value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_bf16_pop: + MFMA FLOPs (BF16): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_f16_pop: + MFMA FLOPs (F16): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_f32_pop: + MFMA FLOPs (F32): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_f64_pop: + MFMA FLOPs (F64): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_i8_pop: + MFMA IOPs (INT8): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - metric_table: @@ -51,36 +78,48 @@ Panel Config: unit: Unit tips: Tips metric: - IPC (Avg): + IPC: avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) unit: Instr/cycle tips: - IPC (Issue): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + IPC (Issued): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) - min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) unit: Instr/cycle tips: - SALU Util: + SALU Utilization: avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) min: MIN((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) max: MAX((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) 
unit: pct tips: - VALU Util: + VALU Utilization: avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) min: MIN((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) max: MAX((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) unit: pct tips: + VMEM Utilization: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: pct + tips: + Branch Utilization: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: pct + tips: VALU Active Threads: avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) @@ -90,7 +129,7 @@ Panel Config: != 0) else None)) unit: Threads tips: - MFMA Util: + MFMA Utilization: avg: None # No HW module min: None # No HW module max: None # No HW module @@ -102,6 +141,20 @@ Panel Config: max: None # No HW module unit: cycles/instr tips: + VMEM Latency: + avg: None # No perf counter + min: None # No perf counter + max: None # No perf counter + unit: Cycles + coll_level: SQ_INST_LEVEL_VMEM + tips: + SMEM Latency: + avg: None # No perf counter + min: None # No perf counter + max: None # No perf counter + unit: Cycles + coll_level: SQ_INST_LEVEL_SMEM + tips: - metric_table: id: 1103 @@ -120,7 +173,7 @@ Panel Config: max: None # No perf counter unit: (OPs + $normUnit) tips: - INT8 OPs: + IOPs (Total): avg: None # No perf counter min: None # No perf counter max: None # No perf counter @@ -150,41 +203,9 @@ Panel Config: max: None # No perf counter unit: (OPs + $normUnit) tips: - - - metric_table: - id: 1104 - title: Memory Latencies - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - VMEM Latency: - avg: None # No perf counter - min: None # No perf counter - max: None # No perf counter - unit: Cycles - tips: SQ_INSTS_LEVEL_VMEM - SMEM Latency: - avg: None # No perf counter - min: None # No perf counter - max: None # No perf counter - unit: Cycles - tips: SQ_INSTS_LEVEL_SMEM - Instr Fetch Latency: - avg: None # No perf counter - min: None # No perf counter - max: None # No perf counter - unit: Cycles - tips: SQ_IFETCH_LEVEL - LDS Latency: + INT8 OPs: avg: None # No perf counter min: None # No perf counter max: None # No perf counter - unit: Cycles - tips: SQ_INST_LEVEL_LDS - - + unit: (OPs + $normUnit) + tips: diff --git a/src/omniperf_analyze/configs/gfx906/1200_lds.yaml b/src/omniperf_analyze/configs/gfx906/1200_lds.yaml index 3fd52c3b1..8e40452dc 100644 --- a/src/omniperf_analyze/configs/gfx906/1200_lds.yaml +++ b/src/omniperf_analyze/configs/gfx906/1200_lds.yaml @@ -26,20 +26,24 @@ Panel Config: value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / (GRBM_GUI_ACTIVE * $numCU))) unit: Pct of Peak tips: + unit: pct Access Rate: value: AVG(((200 * SQ_ACTIVE_INST_LDS) / (GRBM_GUI_ACTIVE * $numCU))) unit: Pct of Peak tips: - Bandwidth (Pct-of-Peak): + unit: pct + Theoretical Bandwidth (% of Peak): value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs)) / (($sclk * $numCU) * 0.00128))) unit: Pct of Peak tips: + unit: pct Bank Conflict Rate: value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) unit: Pct of Peak tips: + unit: pct - metric_table: id: 1202 @@ -58,7 +62,7 @@ Panel Config: max: MAX((SQ_INSTS_LDS / $denom)) unit: (Instr + $normUnit) tips: - Bandwidth: + Theoretical Bandwidth: avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / 
$denom)) min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) @@ -67,7 +71,14 @@ Panel Config: / $denom)) unit: (Bytes + $normUnit) tips: - Bank Conficts/Access: + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + tips: + Bank Conflicts/Access: avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) @@ -82,7 +93,7 @@ Panel Config: max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) unit: (Cycles + $normUnit) tips: - Atomic Cycles: + Atomic Return Cycles: avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) @@ -110,12 +121,5 @@ Panel Config: avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) - unit: ( + $normUnit) - tips: - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_LDS - tips: + unit: (Accesses + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx906/1300_instruction-cache.yaml b/src/omniperf_analyze/configs/gfx906/1300_instruction-cache.yaml index 05dc75980..555bc714a 100644 --- a/src/omniperf_analyze/configs/gfx906/1300_instruction-cache.yaml +++ b/src/omniperf_analyze/configs/gfx906/1300_instruction-cache.yaml @@ -13,7 +13,7 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: @@ -27,11 +27,16 @@ Panel Config: * (EndNs - BeginNs)))) unit: Pct of Peak tips: - Cache Hit: + Cache Hit Rate: value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + SQC_ICACHE_MISSES_DUPLICATE))) unit: Pct of Peak tips: + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($sclk * $numSQC) + * (EndNs - BeginNs)))) + unit: Pct of Peak + tips: - metric_table: id: 1302 @@ -68,7 +73,7 @@ Panel Config: max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) unit: (Misses + $normUnit) tips: - Cache Hit: + Cache Hit Rate: avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + SQC_ICACHE_MISSES_DUPLICATE))) min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + @@ -77,3 +82,27 @@ Panel Config: SQC_ICACHE_MISSES_DUPLICATE))) unit: pct tips: + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + tips: + - metric_table: + id: 1303 + title: Instruction Cache - L2 Interface + header: + metric: Metric + mean: Mean + min: Min + max: Max + unit: Unit + tips: Tips + metric: + L1I-L2 Bandwidth: + mean: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) + tips: \ No newline at end of file diff --git 
a/src/omniperf_analyze/configs/gfx906/1400_constant-cache.yaml b/src/omniperf_analyze/configs/gfx906/1400_constant-cache.yaml index 563caad13..0a2bc4b57 100644 --- a/src/omniperf_analyze/configs/gfx906/1400_constant-cache.yaml +++ b/src/omniperf_analyze/configs/gfx906/1400_constant-cache.yaml @@ -12,8 +12,8 @@ Panel Config: id: 1401 title: Speed-of-Light header: - mertic: Metric - value: Value + metric: Metric + value: Avg unit: Unit tips: Tips style: @@ -27,12 +27,17 @@ Panel Config: * (EndNs - BeginNs)))) unit: Pct of Peak tips: - Cache Hit: + Cache Hit Rate: value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: Pct of Peak tips: + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) + / (2 * ($sclk * $numSQC) * (EndNs - BeginNs))) + unit: Pct of Peak + tips: - metric_table: id: 1402 @@ -138,6 +143,12 @@ Panel Config: unit: Unit tips: Tips metric: + sL1D-L2 BW: + mean: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + unit: (Bytes + $normUnit) + tips: Read Req: avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) min: MIN((SQC_TC_DATA_READ_REQ / $denom)) diff --git a/src/omniperf_analyze/configs/gfx906/1500_TA_and_TD.yaml b/src/omniperf_analyze/configs/gfx906/1500_TA_and_TD.yaml index 8f71cedc9..773bb7c76 100644 --- a/src/omniperf_analyze/configs/gfx906/1500_TA_and_TD.yaml +++ b/src/omniperf_analyze/configs/gfx906/1500_TA_and_TD.yaml @@ -6,11 +6,11 @@ Metric Description: # Define the panel properties and properties of each metric in the panel. 
Panel Config: id: 1500 - title: Texture Addresser and Texture Data (TA/TD) + title: Address Processing Unit and Data Return Path (TA/TD) data source: - metric_table: id: 1501 - title: TA + title: Address Processing Unit header: metric: Metric avg: Avg @@ -19,25 +19,25 @@ Panel Config: unit: Unit tips: Tips metric: - TA Busy: + Address Processing Unit Busy: avg: AVG(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TA Addr Stall: + Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TA Data Stall: + Data Stall: avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TD2TA Addr Stall: + Data-Processor → Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) @@ -47,69 +47,69 @@ Panel Config: avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Instr: + Global/Generic Instructions: avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Read Instr: + Global/Generic Read Instructions: avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Write Instr: + Global/Generic Write Instructions: avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Atomic Instr: + Global/Generic Atomic Instructions: avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Instr: + Spill/Stack Instructions: avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Read Instr: + Spill/Stack Read Instructions: avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Write Instr: + Spill/Stack Write Instructions: avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: 
(Instructions + $normUnit) tips: - Buffer Atomic Instr: + Spill/Stack Atomic Instructions: avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Total Cylces: + Spill/Stack Total Cycles: avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) tips: - Buffer Coalesced Read: + Spill/Stack Coalesced Read: avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) tips: - Buffer Coalesced Write: + Spill/Stack Coalesced Write: avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) @@ -118,7 +118,7 @@ Panel Config: - metric_table: id: 1502 - title: TD + title: Data-Return Path header: metric: Metric avg: Avg @@ -127,48 +127,48 @@ Panel Config: unit: Unit tips: Tips metric: - TD Busy: + Data-Return Busy: avg: AVG(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TD Stall: + Cache RAM → Data-Return Stall: avg: AVG(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - SPI2TD Stall: + Workgroup manager → Data-Return Stall: avg: # No perf counter min: # No perf counter max: # No perf counter unit: pct tips: - Coalescable Instr: + Coalescable Instructions: avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Load Instr: + Read Instructions: avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Store Instr: + Write Instructions: avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Atomic Instr: + Atomic Instructions: avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: diff --git a/src/omniperf_analyze/configs/gfx906/1600_L1_cache.yaml b/src/omniperf_analyze/configs/gfx906/1600_L1_cache.yaml index 01e6d29d7..66f6a5e3d 100644 --- a/src/omniperf_analyze/configs/gfx906/1600_L1_cache.yaml +++ b/src/omniperf_analyze/configs/gfx906/1600_L1_cache.yaml @@ -13,7 +13,7 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: @@ -22,26 +22,26 @@ Panel Config: label_txt: (%) xrange: [0, 110] metric: - Buffer Coalescing: - value: 
AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - tips: - Cache Util: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) unit: Pct of Peak tips: - Cache BW: + Bandwidth: value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs)))) / ((($sclk / 1000) * 64) * $numCU)) unit: Pct of Peak tips: - Cache Hit: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + tips: + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) unit: Pct of Peak tips: @@ -141,11 +141,26 @@ Panel Config: unit: (Req + $normUnit) tips: Cache BW: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - unit: GB/s + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + unit: (Bytes + $normUnit) tips: + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + unit: pct + tips: Cache Accesses: avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) @@ -164,22 +179,7 @@ Panel Config: / $denom)) unit: (Req + $normUnit) tips: - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: pct - tips: - Invalidate: + Invalidations: avg: 
AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) @@ -188,9 +188,9 @@ Panel Config: L1-L2 BW: avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - min: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - max: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) unit: (Bytes + $normUnit) tips: @@ -388,17 +388,17 @@ Panel Config: avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Hits + $normUnit) + units: (Req + $normUnit) tips: - Misses (Translation): + Translation Misses: avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Misses + $normUnit) + units: (Req + $normUnit) tips: - Misses (Permission): + Permission Misses: avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Misses + $normUnit) + units: (Req + $normUnit) tips: diff --git a/src/omniperf_analyze/configs/gfx906/1700_L2_cache.yaml b/src/omniperf_analyze/configs/gfx906/1700_L2_cache.yaml index 0b5f5e827..8cc5cf53b 100644 --- a/src/omniperf_analyze/configs/gfx906/1700_L2_cache.yaml +++ b/src/omniperf_analyze/configs/gfx906/1700_L2_cache.yaml @@ -13,31 +13,35 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: type: simple_bar metric: - L2 Util: + Utilization: value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($L2Banks) * GRBM_GUI_ACTIVE))) unit: pct + tips: + Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 64) / (EndNs - BeginNs)))) / ((($sclk / 1000) * 64) * TO_INT($L2Banks))) + unit: pct tips: - Cache Hit: + Hit Rate: value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)) unit: pct - tips: - L2-EA Rd BW: + tips: + L2-Fabric Read BW: value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / (EndNs - BeginNs))) unit: GB/s - tips: - L2-EA Wr BW: + tips: + L2-Fabric Write and Atomic BW: value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / (EndNs - BeginNs))) unit: GB/s - tips: + tips: - metric_table: id: 1702 @@ -50,7 +54,7 @@ Panel Config: unit: Unit tips: Tips metric: - Read BW: + L2-Fabric Read BW: avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / $denom)) min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) @@ -58,8 +62,26 @@ Panel Config: max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / $denom)) unit: (Bytes + $normUnit) - tips: - Write BW: + tips: + HBM Read Traffic: + avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * 
(TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + Remote Read Traffic: + avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + L2-Fabric Write and Atomic BW: avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / $denom)) min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) @@ -67,55 +89,31 @@ Panel Config: max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / $denom)) unit: (Bytes + $normUnit) - tips: - Read (32B): - avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (Uncached 32B): - avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (64B): - avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - HBM Read: - avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write (32B): - avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - Write (Uncached 32B): - avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write (64B): - avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Write: - avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: + tips: + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Remote Write and Atomic Traffic: + avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_WRREQ_sum - 
TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Atomic Traffic: + avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: Read Latency: avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) @@ -124,7 +122,7 @@ Panel Config: max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) unit: Cycles - tips: + tips: Write Latency: avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) @@ -133,7 +131,7 @@ Panel Config: max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) unit: Cycles - tips: + tips: Atomic Latency: avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum != 0) else None)) @@ -142,7 +140,7 @@ Panel Config: max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum != 0) else None)) unit: Cycles - tips: + tips: Read Stall: avg: AVG((((100 * ((TCC_EA_RDREQ_IO_CREDIT_STALL_sum + TCC_EA_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != @@ -154,7 +152,7 @@ Panel Config: + TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) unit: pct - tips: + tips: Write Stall: avg: AVG((((100 * ((TCC_EA_WRREQ_IO_CREDIT_STALL_sum + TCC_EA_WRREQ_GMI_CREDIT_STALL_sum) + TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != @@ -166,7 +164,7 @@ Panel Config: + TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) unit: pct - tips: + tips: - metric_table: id: 1703 @@ -179,121 +177,127 @@ Panel Config: unit: Unit tips: Tips metric: + Bandwidth: + avg: AVG((TCC_REQ_sum * 64) / $denom) + min: MIN((TCC_REQ_sum * 64) / $denom) + max: MAX((TCC_REQ_sum * 64) / $denom) + unit: (Bytes + $normUnit) + tips: Req: avg: AVG((TCC_REQ_sum / $denom)) min: MIN((TCC_REQ_sum / $denom)) max: MAX((TCC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: + tips: Read Req: avg: AVG((TCC_READ_sum / $denom)) min: MIN((TCC_READ_sum / $denom)) max: MAX((TCC_READ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: Write Req: avg: AVG((TCC_WRITE_sum / $denom)) min: MIN((TCC_WRITE_sum / $denom)) max: MAX((TCC_WRITE_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: Atomic Req: avg: AVG((TCC_ATOMIC_sum / $denom)) min: MIN((TCC_ATOMIC_sum / $denom)) max: MAX((TCC_ATOMIC_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: 
MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: Probe Req: avg: AVG((TCC_PROBE_sum / $denom)) min: MIN((TCC_PROBE_sum / $denom)) max: MAX((TCC_PROBE_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + tips: Hits: avg: AVG((TCC_HIT_sum / $denom)) min: MIN((TCC_HIT_sum / $denom)) max: MAX((TCC_HIT_sum / $denom)) unit: (Hits + $normUnit) - tips: + tips: Misses: avg: AVG((TCC_MISS_sum / $denom)) min: MIN((TCC_MISS_sum / $denom)) max: MAX((TCC_MISS_sum / $denom)) unit: (Misses + $normUnit) - tips: - Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - tips: + tips: Writeback: avg: AVG((TCC_WRITEBACK_sum / $denom)) min: MIN((TCC_WRITEBACK_sum / $denom)) max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: + unit: (Cachelines + $normUnit) + tips: + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Evict (Normal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: NC Req: avg: AVG((TCC_NC_REQ_sum / $denom)) min: MIN((TCC_NC_REQ_sum / $denom)) max: MAX((TCC_NC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: UC Req: avg: AVG((TCC_UC_REQ_sum / $denom)) min: MIN((TCC_UC_REQ_sum / $denom)) max: MAX((TCC_UC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: CC Req: avg: AVG((TCC_CC_REQ_sum / $denom)) min: MIN((TCC_CC_REQ_sum / $denom)) max: MAX((TCC_CC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: RW Req: avg: None # No HW module min: None # No HW module max: None # No HW module unit: (Req + $normUnit) - tips: - Writeback (Normal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: - Writeback (TC Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: - Evict (Normal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: ( + $normUnit) - tips: - Evict (TC Req): - avg: 
AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: ( + $normUnit) - tips: + tips: - metric_table: id: 1704 - title: L2 - EA Interface Stalls + title: L2 - Fabric Interface Stalls header: metric: Metric type: Type @@ -306,59 +310,137 @@ Panel Config: style: type: simple_multi_bar metric: - Read - Remote Socket Stall: - type: Remote Socket Stall + Read - PCIe Stall: + type: PCIe Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read - Peer GCD Stall: - type: Peer GCD Stall + avg: AVG(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Read - Infinity Fabric™ Stall: + type: Infinity Fabric™ Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Read - HBM Stall: type: HBM Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write - Remote Socket Stall: - type: Remote Socket Stall + avg: AVG(((100 * (TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Write - PCIe Stall: + type: PCIe Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write - Peer GCD Stall: - type: Peer GCD Stall + avg: AVG(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Write - Infinity Fabric™ Stall: + type: Infinity Fabric™ Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * 
(TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Write - HBM Stall: type: HBM Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Write - Credit Starvation: type: Credit Starvation transaction: Write - avg: AVG((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) - min: MIN((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) - max: MAX((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + + - metric_table: + id: 1705 + title: L2 - Fabric Detailed Transaction Breakdown + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Read (32B): + avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: + Read (Uncached): + avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Read (64B): + avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + unit: (Req + $normUnit) + tips: + HBM Read: + avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + tips: + Remote Read: + avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (32B): + avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (Uncached): + avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (64B): + avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + tips: + HBM Write and Atomic: + avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + tips: + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_WRREQ_sum - 
TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + tips: + Atomic: + avg: AVG((TCC_EA_ATOMIC_sum / $denom)) + min: MIN((TCC_EA_ATOMIC_sum / $denom)) + max: MAX((TCC_EA_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx906/1800_L2_cache_per_channel.yaml b/src/omniperf_analyze/configs/gfx906/1800_L2_cache_per_channel.yaml index 7a808c5b8..c7d1851e7 100644 --- a/src/omniperf_analyze/configs/gfx906/1800_L2_cache_per_channel.yaml +++ b/src/omniperf_analyze/configs/gfx906/1800_L2_cache_per_channel.yaml @@ -13,7 +13,7 @@ Panel Config: title: Aggregate Stats (All 32 channels) header: metric: Metric - avg: Mean + avg: Avg std dev: Std Dev min: Min max: Max @@ -167,7 +167,7 @@ Panel Config: + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) unit: pct tips: - Req: + L2 Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_REQ[0]) + TO_INT(TCC_REQ[1])) + TO_INT(TCC_REQ[2])) + TO_INT(TCC_REQ[3])) + TO_INT(TCC_REQ[4])) + TO_INT(TCC_REQ[5])) + TO_INT(TCC_REQ[6])) + TO_INT(TCC_REQ[7])) + TO_INT(TCC_REQ[8])) + TO_INT(TCC_REQ[9])) @@ -206,7 +206,7 @@ Panel Config: + TO_INT(TCC_REQ[30])) + TO_INT(TCC_REQ[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Read Req: + L2 Read Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_READ[0]) + TO_INT(TCC_READ[1])) + TO_INT(TCC_READ[2])) + TO_INT(TCC_READ[3])) + TO_INT(TCC_READ[4])) + TO_INT(TCC_READ[5])) + TO_INT(TCC_READ[6])) + TO_INT(TCC_READ[7])) + TO_INT(TCC_READ[8])) + TO_INT(TCC_READ[9])) @@ -249,7 +249,7 @@ Panel Config: + TO_INT(TCC_READ[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Write Req: + L2 Write Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_WRITE[0]) + TO_INT(TCC_WRITE[1])) + TO_INT(TCC_WRITE[2])) + TO_INT(TCC_WRITE[3])) + TO_INT(TCC_WRITE[4])) + TO_INT(TCC_WRITE[5])) + TO_INT(TCC_WRITE[6])) + TO_INT(TCC_WRITE[7])) + TO_INT(TCC_WRITE[8])) @@ -296,7 +296,7 @@ Panel Config: + TO_INT(TCC_WRITE[30])) + TO_INT(TCC_WRITE[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Atomic Req: + L2 Atomic Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_ATOMIC[0]) + TO_INT(TCC_ATOMIC[1])) + TO_INT(TCC_ATOMIC[2])) + TO_INT(TCC_ATOMIC[3])) + TO_INT(TCC_ATOMIC[4])) + TO_INT(TCC_ATOMIC[5])) + TO_INT(TCC_ATOMIC[6])) + TO_INT(TCC_ATOMIC[7])) @@ -347,7 +347,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Read Req: + L2 - Fabric Read Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ[0]) + TO_INT(TCC_EA_RDREQ[1])) + TO_INT(TCC_EA_RDREQ[2])) + TO_INT(TCC_EA_RDREQ[3])) + TO_INT(TCC_EA_RDREQ[4])) + TO_INT(TCC_EA_RDREQ[5])) + TO_INT(TCC_EA_RDREQ[6])) + TO_INT(TCC_EA_RDREQ[7])) @@ -398,7 +398,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Write Req: + L2 - Fabric Write Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ[0]) + TO_INT(TCC_EA_WRREQ[1])) + TO_INT(TCC_EA_WRREQ[2])) + TO_INT(TCC_EA_WRREQ[3])) + TO_INT(TCC_EA_WRREQ[4])) + TO_INT(TCC_EA_WRREQ[5])) + TO_INT(TCC_EA_WRREQ[6])) + TO_INT(TCC_EA_WRREQ[7])) @@ -449,7 +449,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Atomic Req: + L2 - Fabric Atomic Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_ATOMIC[0]) + TO_INT(TCC_EA_ATOMIC[1])) + TO_INT(TCC_EA_ATOMIC[2])) + TO_INT(TCC_EA_ATOMIC[3])) + TO_INT(TCC_EA_ATOMIC[4])) + TO_INT(TCC_EA_ATOMIC[5])) + 
TO_INT(TCC_EA_ATOMIC[6])) + TO_INT(TCC_EA_ATOMIC[7])) @@ -500,7 +500,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Read Lat: + L2 - Fabric Read Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_RDREQ_LEVEL[0] + TCC_EA_RDREQ_LEVEL[1]) + TCC_EA_RDREQ_LEVEL[2]) + TCC_EA_RDREQ_LEVEL[3]) + TCC_EA_RDREQ_LEVEL[4]) + TCC_EA_RDREQ_LEVEL[5]) + TCC_EA_RDREQ_LEVEL[6]) + TCC_EA_RDREQ_LEVEL[7]) @@ -615,7 +615,7 @@ Panel Config: + TCC_EA_RDREQ[29]) + TCC_EA_RDREQ[30]) + TCC_EA_RDREQ[31]) != 0) else None)) unit: Cycles tips: - L2 - EA Write Lat: + L2 - Fabric Write Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_WRREQ_LEVEL[0] + TCC_EA_WRREQ_LEVEL[1]) + TCC_EA_WRREQ_LEVEL[2]) + TCC_EA_WRREQ_LEVEL[3]) + TCC_EA_WRREQ_LEVEL[4]) + TCC_EA_WRREQ_LEVEL[5]) + TCC_EA_WRREQ_LEVEL[6]) + TCC_EA_WRREQ_LEVEL[7]) @@ -730,7 +730,7 @@ Panel Config: + TCC_EA_WRREQ[29]) + TCC_EA_WRREQ[30]) + TCC_EA_WRREQ[31]) != 0) else None)) unit: Cycles tips: - L2 - EA Atomic Lat: + L2 - Fabric Atomic Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_ATOMIC_LEVEL[0] + TCC_EA_ATOMIC_LEVEL[1]) + TCC_EA_ATOMIC_LEVEL[2]) + TCC_EA_ATOMIC_LEVEL[3]) + TCC_EA_ATOMIC_LEVEL[4]) + TCC_EA_ATOMIC_LEVEL[5]) + TCC_EA_ATOMIC_LEVEL[6]) + TCC_EA_ATOMIC_LEVEL[7]) @@ -849,49 +849,49 @@ Panel Config: None)) unit: Cycles tips: - L2 - EA Read Stall (IO): + L2 - Fabric Read Stall (PCIe): avg: None # No perf counter std dev: None # No perf counter min: None # No perf counter max: None # No perf counter unit: (Cycles + $normUnit) tips: - L2 - EA Read Stall (GMI): + L2 - Fabric Read Stall (Infinity Fabric™): avg: None # No perf counter std dev: None # No perf counter min: None # No perf counter max: None # No perf counter unit: (Cycles + $normUnit) tips: - L2 - EA Read Stall (DRAM): + L2 - Fabric Read Stall (HBM): avg: None # No perf counter std dev: None # No perf counter min: None # No perf counter max: None # No perf counter unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (IO): + L2 - Fabric Write Stall (PCIe): avg: None # No perf counter std dev: None # No perf counter min: None # No perf counter max: None # No perf counter unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (GMI): + L2 - Fabric Write Stall (Infinity Fabric™): avg: None # No perf counter std dev: None # No perf counter min: None # No perf counter max: None # No perf counter unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (DRAM): + L2 - Fabric Write Stall (HBM): avg: None # No perf counter std dev: None # No perf counter min: None # No perf counter max: None # No perf counter unit: (Cycles + $normUnit) tips: - L2 - EA Write Starve: + L2 - Fabric Write Starve: avg: None # No perf counter std dev: None # No perf counter min: None # No perf counter @@ -906,22 +906,22 @@ Panel Config: channel: Channel hit rate: L2 Cache Hit Rate (%) req: Requests (Requests) - read req: L1-L2 Read (Requests) - write req: L1-L2 Write (Requests) - atomic req: L1-L2 Atomic (Requests) - ea read req: L2-EA Read (Requests) - ea write req: L2-EA Write (Requests) - ea atomic req: L2-EA Atomic (Requests) - ea read lat - cycles: L2-EA Read Latency (Cycles) - ea write lat - cycles: L2-EA Write Latency (Cycles) - ea atomic lat - cycles: L2-EA Atomic Latency (Cycles) - ea read stall - io: L2-EA Read Stall - IO (Cycles per) - ea read stall - gmi: L2-EA Read Stall - GMI (Cycles per) - ea read stall - dram: L2-EA Read Stall - DRAM (Cycles per) - ea write stall - io: L2-EA Write Stall - IO (Cycles per) - ea write stall - gmi: L2-EA Write Stall - GMI (Cycles per) - ea write 
stall - dram: L2-EA Write Stall - DRAM (Cycles per) - ea write stall - starve: L2-EA Write Stall - Starve (Cycles per) + read req: L2 Read (Requests) + write req: L2 Write (Requests) + atomic req: L2 Atomic (Requests) + ea read req: L2-Fabric Read (Requests) + ea write req: L2-Fabric Write and Atomic (Requests) + ea atomic req: L2-Fabric Atomic (Requests) + ea read lat - cycles: L2-Fabric Read Latency (Cycles) + ea write lat - cycles: L2-Fabric Write Latency (Cycles) + ea atomic lat - cycles: L2-Fabric Atomic Latency (Cycles) + ea read stall - io: L2-Fabric Read Stall - PCIe (Cycles per) + ea read stall - gmi: L2-Fabric Read Stall - Infinity Fabric™ (Cycles per) + ea read stall - dram: L2-Fabric Read Stall - HBM (Cycles per) + ea write stall - io: L2-Fabric Write Stall - PCIe (Cycles per) + ea write stall - gmi: L2-Fabric Write Stall - Infinity Fabric™ (Cycles per) + ea write stall - dram: L2-Fabric Write Stall - HBM (Cycles per) + ea write stall - starve: L2-Fabric Write Stall - Starve (Cycles per) tips: Tips metric: "0": @@ -1381,22 +1381,22 @@ Panel Config: channel: Channel hit rate: L2 Cache Hit Rate (%) req: Requests (Requests) - read req: L1-L2 Read (Requests) - write req: L1-L2 Write (Requests) - atomic req: L1-L2 Atomic (Requests) - ea read req: L2-EA Read (Requests) - ea write req: L2-EA Write (Requests) - ea atomic req: L2-EA Atomic (Requests) - ea read lat - cycles: L2-EA Read Latency (Cycles) - ea write lat - cycles: L2-EA Write Latency (Cycles) - ea atomic lat - cycles: L2-EA Atomic Latency (Cycles) - ea read stall - io: L2-EA Read Stall - IO (Cycles per) - ea read stall - gmi: L2-EA Read Stall - GMI (Cycles per) - ea read stall - dram: L2-EA Read Stall - DRAM (Cycles per) - ea write stall - io: L2-EA Write Stall - IO (Cycles per) - ea write stall - gmi: L2-EA Write Stall - GMI (Cycles per) - ea write stall - dram: L2-EA Write Stall - DRAM (Cycles per) - ea write stall - starve: L2-EA Write Stall - Starve (Cycles per) + read req: L2 Read (Requests) + write req: L2 Write (Requests) + atomic req: L2 Atomic (Requests) + ea read req: L2-Fabric Read (Requests) + ea write req: L2-Fabric Write and Atomic (Requests) + ea atomic req: L2-Fabric Atomic (Requests) + ea read lat - cycles: L2-Fabric Read Latency (Cycles) + ea write lat - cycles: L2-Fabric Write Latency (Cycles) + ea atomic lat - cycles: L2-Fabric Atomic Latency (Cycles) + ea read stall - io: L2-Fabric Read Stall - PCIe (Cycles per) + ea read stall - gmi: L2-Fabric Read Stall - Infinity Fabric™ (Cycles per) + ea read stall - dram: L2-Fabric Read Stall - HBM (Cycles per) + ea write stall - io: L2-Fabric Write Stall - PCIe (Cycles per) + ea write stall - gmi: L2-Fabric Write Stall - Infinity Fabric™ (Cycles per) + ea write stall - dram: L2-Fabric Write Stall - HBM (Cycles per) + ea write stall - starve: L2-Fabric Write Stall - Starve (Cycles per) tips: Tips metric: "16": diff --git a/src/omniperf_analyze/configs/gfx908/0200_system-speed-of-light.yaml b/src/omniperf_analyze/configs/gfx908/0200_system-speed-of-light.yaml index 986b2f0ae..bc9dea77f 100644 --- a/src/omniperf_analyze/configs/gfx908/0200_system-speed-of-light.yaml +++ b/src/omniperf_analyze/configs/gfx908/0200_system-speed-of-light.yaml @@ -14,10 +14,10 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit peak: Peak - pop: PoP + pop: Pct of Peak tips: Tips metric: VALU FLOPs: @@ -28,7 +28,7 @@ Panel Config: tips: VALU IOPs: value: None # No perf counter - unit: GOPs + unit: GIOPs peak: (((($sclk * $numCU) * 64) * 2) / 
1000) pop: None # No perf counter tips: @@ -68,25 +68,37 @@ Panel Config: peak: $numCU pop: ((100 * $numActiveCUs) / $numCU) tips: - SALU Util: + SALU Utilization: value: AVG(((100 * SQ_ACTIVE_INST_SCA) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct peak: 100 pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / (GRBM_GUI_ACTIVE * $numCU))) tips: - VALU Util: + VALU Utilization: value: AVG(((100 * SQ_ACTIVE_INST_VALU) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct peak: 100 pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / (GRBM_GUI_ACTIVE * $numCU))) tips: - MFMA Util: + MFMA Utilization: value: None # No HW module unit: pct peak: 100 pop: None # No HW module tips: - VALU Active Threads/Wave: + VMEM Utilization: + value: None # No HW module + unit: pct + peak: 100 + pop: None # No HW module + tips: + Branch Utilization: + value: None # No HW module + unit: pct + peak: 100 + pop: None # No HW module + tips: + VALU Active Threads: value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) unit: Threads @@ -94,25 +106,29 @@ Panel Config: pop: (AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) * 1.5625) tips: - IPC - Issue: - value: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) - / SQ_ACTIVE_INST_ANY)) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) unit: Instr/cycle peak: 5 - pop: ((100 * AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) - / SQ_ACTIVE_INST_ANY))) / 5) + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) tips: - LDS BW: + Wavefront Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE)) + unit: Wavefronts + peak: ($maxWavesPerCU * $numCU) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE) / ($maxWavesPerCU + * $numCU)))) + coll_level: SQ_LEVEL_WAVES + tips: + Theoretical LDS Bandwidth: value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs))) - unit: GB/sec + unit: GB/s peak: (($sclk * $numCU) * 0.128) pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs)) / (($sclk * $numCU) * 0.00128))) tips: - LDS Bank Conflict: + LDS Bank Conflicts/Access: value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) unit: Conflicts/access @@ -120,35 +136,7 @@ Panel Config: pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) tips: - Instr Cache Hit Rate: - value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - tips: - Instr Cache BW: - value: AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64)) - unit: GB/s - peak: ((($sclk / 1000) * 64) * $numSQC) - pop: ((100 * AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk - / 1000) * 64) * $numSQC)) - tips: - Scalar L1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - tips: - Scalar
L1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64)) - unit: GB/s - peak: ((($sclk / 1000) * 64) * $numSQC) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk - / 1000) * 64) * $numSQC)) - tips: - Vector L1D Cache Hit Rate: + vL1D Cache Hit Rate: value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else @@ -160,7 +148,7 @@ Panel Config: TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else None)) tips: - Vector L1D Cache BW: + vL1D Cache BW: value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) unit: GB/s peak: ((($sclk / 1000) * 64) * $numCU) @@ -175,6 +163,13 @@ Panel Config: pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else None)) tips: + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 64) / (EndNs - BeginNs))) + unit: GB/s + peak: ((($sclk / 1000) * 64) * TO_INT($L2Banks)) + pop: ((100 * AVG(((TCC_REQ_sum * 64) / (EndNs - BeginNs)))) + / ((($sclk / 1000) * 64) * TO_INT($L2Banks))) + tips: L2-Fabric Read BW: value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / (EndNs - BeginNs))) @@ -195,36 +190,48 @@ Panel Config: value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None tips: L2-Fabric Write Latency: value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None tips: - Wave Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE)) - unit: Wavefronts - peak: ($maxWavesPerCU * $numCU) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE) / ($maxWavesPerCU - * $numCU)))) - coll_level: SQ_LEVEL_WAVES + sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) tips: - Instr Fetch BW: - value: AVG(((SQ_IFETCH / (EndNs - BeginNs)) * 32)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64)) unit: GB/s - peak: ((($sclk / 1000) * 32) * $numSQC) - pop: ((100 * AVG(((SQ_IFETCH / (EndNs - BeginNs)) * 32))) / ($numSQC - * (($sclk / 1000) * 32))) - coll_level: SQ_IFETCH_LEVEL + peak: ((($sclk / 1000) * 64) * $numSQC) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk + / 1000) * 64) * $numSQC)) + tips: + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) tips: - Instr Fetch Latency: + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64)) + unit: GB/s + peak: ((($sclk / 1000) * 64) * $numSQC) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk + / 1000) * 64) * $numSQC)) + tips: + L1I Fetch Latency: value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None coll_level: SQ_IFETCH_LEVEL tips: diff --git a/src/omniperf_analyze/configs/gfx908/0500_command-processor.yaml b/src/omniperf_analyze/configs/gfx908/0500_command-processor.yaml 
index 525091879..edd42da6e 100644 --- a/src/omniperf_analyze/configs/gfx908/0500_command-processor.yaml +++ b/src/omniperf_analyze/configs/gfx908/0500_command-processor.yaml @@ -19,19 +19,7 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy Cycles: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles/Kernel - tips: - CPF Busy: - avg: AVG(CPF_CPF_STAT_BUSY) - min: MIN(CPF_CPF_STAT_BUSY) - max: MAX(CPF_CPF_STAT_BUSY) - unit: Cycles/Kernel - tips: - CPF Util: + CPF Utilization: avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) @@ -47,15 +35,9 @@ Panel Config: != 0) else None)) max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY != 0) else None)) - unit: Cycles/Kernel - tips: - L2Cache Intf Busy: - avg: AVG(CPF_CPF_TCIU_BUSY) - min: MIN(CPF_CPF_TCIU_BUSY) - max: MAX(CPF_CPF_TCIU_BUSY) - unit: Cycles/Kernel + unit: pct tips: - L2Cache Intf Util: + CPF-L2 Utilization: avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) @@ -64,7 +46,7 @@ Panel Config: if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) unit: pct tips: - L2Cache Intf Stall: + CPF-L2 Stall: avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY != 0) else None)) min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY @@ -73,11 +55,14 @@ Panel Config: != 0) else None)) unit: pct tips: - UTCL1 Stall: - avg: AVG(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - min: MIN(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - max: MAX(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - unit: Cycles/Kernel + CPF-UTCL1 Stall: + avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + unit: pct tips: - metric_table: @@ -91,19 +76,7 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy Cycles: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles - tips: - CPC Busy Cycles: - avg: AVG(CPC_CPC_STAT_BUSY) - min: MIN(CPC_CPC_STAT_BUSY) - max: MAX(CPC_CPC_STAT_BUSY) - unit: Cycles - tips: - CPC Util: + CPC Utilization: avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) @@ -112,12 +85,6 @@ Panel Config: if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) unit: pct tips: - CPC Stall Cycles: - avg: AVG(CPC_CPC_STAT_STALL) - min: MIN(CPC_CPC_STAT_STALL) - max: MAX(CPC_CPC_STAT_STALL) - unit: Cycles - tips: CPC Stall Rate: avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY != 0) else None)) @@ -127,28 +94,19 @@ Panel Config: != 0) else None)) unit: pct tips: - CPC Packet Decoding: - avg: AVG(CPC_ME1_BUSY_FOR_PACKET_DECODE) - min: MIN(CPC_ME1_BUSY_FOR_PACKET_DECODE) - max: MAX(CPC_ME1_BUSY_FOR_PACKET_DECODE) - unit: Cycles - tips: - SPI Intf Busy Cycles: - avg: 
AVG(CPC_ME1_DC0_SPI_BUSY) - min: MIN(CPC_ME1_DC0_SPI_BUSY) - max: MAX(CPC_ME1_DC0_SPI_BUSY) - unit: Cycles - tips: - SPI Intf Util: - avg: AVG((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) unit: pct tips: - L2Cache Intf Util: + CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + unit: Pct + tips: + CPC-L2 Utilization: avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) @@ -157,19 +115,16 @@ Panel Config: if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) unit: pct tips: - UTCL1 Stall Cycles: - avg: AVG(CPC_UTCL1_STALL_ON_TRANSLATION) - min: MIN(CPC_UTCL1_STALL_ON_TRANSLATION) - max: MAX(CPC_UTCL1_STALL_ON_TRANSLATION) - unit: Cycles - tips: - UTCL2 Intf Busy Cycles: - avg: AVG(CPC_CPC_UTCL2IU_BUSY) - min: MIN(CPC_CPC_UTCL2IU_BUSY) - max: MAX(CPC_CPC_UTCL2IU_BUSY) - unit: Cycles + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: pct tips: - UTCL2 Intf Util: + CPC-UTCL2 Utilization: avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) diff --git a/src/omniperf_analyze/configs/gfx908/0600_shader-processor-input.yaml b/src/omniperf_analyze/configs/gfx908/0600_shader-processor-input.yaml index 38b81ed4f..24d4036ec 100644 --- a/src/omniperf_analyze/configs/gfx908/0600_shader-processor-input.yaml +++ b/src/omniperf_analyze/configs/gfx908/0600_shader-processor-input.yaml @@ -6,11 +6,11 @@ Metric Description: # Define the panel properties and properties of each metric in the panel. 
Panel Config: id: 600 - title: Shader Processor Input (SPI) + title: Workgroup Manager (SPI) data source: - metric_table: id: 601 - title: SPI Stats + title: Workgroup Manager Utilizations header: metric: Metric avg: Avg @@ -19,29 +19,35 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles - tips: - CS Busy: - avg: AVG(SPI_CSN_BUSY) - min: MIN(SPI_CSN_BUSY) - max: MAX(SPI_CSN_BUSY) - unit: Cycles - tips: - SPI Busy: - avg: AVG(GRBM_SPI_BUSY) - min: MIN(GRBM_SPI_BUSY) - max: MAX(GRBM_SPI_BUSY) - unit: Cycles - tips: - SQ Busy: - avg: AVG(SQ_BUSY_CYCLES) - min: MIN(SQ_BUSY_CYCLES) - max: MAX(SQ_BUSY_CYCLES) - unit: Cycles + Accelerator Utilization: + avg: AVG(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + min: MIN(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + max: MAX(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + unit: Pct + tips: + Scheduler-Pipe Utilization: + avg: AVG(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + min: MIN(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + max: MAX(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + unit: Pct + tips: + Workgroup Manager Utilization: + avg: AVG(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + min: MIN(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + max: MAX(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + unit: Pct + tips: + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + min: MIN(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + max: MAX(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + unit: Pct + tips: + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Dispatched Workgroups: avg: AVG(SPI_CSN_NUM_THREADGROUPS) @@ -55,22 +61,27 @@ Panel Config: max: MAX(SPI_CSN_WAVE) unit: Wavefronts tips: - Wave Alloc Failed: - avg: AVG(SPI_RA_REQ_NO_ALLOC) - min: MIN(SPI_RA_REQ_NO_ALLOC) - max: MAX(SPI_RA_REQ_NO_ALLOC) - unit: Cycles - tips: - Wave Alloc Failed - CS: - avg: AVG(SPI_RA_REQ_NO_ALLOC_CSN) - min: MIN(SPI_RA_REQ_NO_ALLOC_CSN) - max: MAX(SPI_RA_REQ_NO_ALLOC_CSN) - unit: Cycles + VGPR Writes: + avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + tips: + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave tips: - - metric_table: id: 602 - title: SPI Resource Allocation + title: Workgroup Manager - Resource Allocation header: metric: Metric avg: Avg @@ -79,96 +90,78 @@ Panel Config: unit: Unit tips: Tips metric: - Wave request Failed (CS): - avg: AVG(SPI_RA_REQ_NO_ALLOC_CSN) - min: MIN(SPI_RA_REQ_NO_ALLOC_CSN) - max: MAX(SPI_RA_REQ_NO_ALLOC_CSN) - unit: Cycles - tips: - CS Stall: - avg: AVG(SPI_RA_RES_STALL_CSN) - min: MIN(SPI_RA_RES_STALL_CSN) - max: MAX(SPI_RA_RES_STALL_CSN) - unit: Cycles - tips: - CS Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * 
SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + unit: Pct + tips: + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + unit: Pct + tips: + Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - unit: pct + unit: Pct tips: - Scratch Stall: - avg: AVG(SPI_RA_TMP_STALL_CSN) - min: MIN(SPI_RA_TMP_STALL_CSN) - max: MAX(SPI_RA_TMP_STALL_CSN) - unit: Cycles + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + unit: Pct tips: Insufficient SIMD Waveslots: - avg: AVG(SPI_RA_WAVE_SIMD_FULL_CSN) - min: MIN(SPI_RA_WAVE_SIMD_FULL_CSN) - max: MAX(SPI_RA_WAVE_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient SIMD VGPRs: - avg: AVG(SPI_RA_VGPR_SIMD_FULL_CSN) - min: MIN(SPI_RA_VGPR_SIMD_FULL_CSN) - max: MAX(SPI_RA_VGPR_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient SIMD SGPRs: - avg: AVG(SPI_RA_SGPR_SIMD_FULL_CSN) - min: MIN(SPI_RA_SGPR_SIMD_FULL_CSN) - max: MAX(SPI_RA_SGPR_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient CU LDS: - avg: AVG(SPI_RA_LDS_CU_FULL_CSN) - min: MIN(SPI_RA_LDS_CU_FULL_CSN) - max: MAX(SPI_RA_LDS_CU_FULL_CSN) - unit: CU - tips: - Insufficient CU Barries: - avg: AVG(SPI_RA_BAR_CU_FULL_CSN) - min: MIN(SPI_RA_BAR_CU_FULL_CSN) - max: MAX(SPI_RA_BAR_CU_FULL_CSN) - unit: CU - tips: - Insufficient Bulky Resource: - avg: AVG(SPI_RA_BULKY_CU_FULL_CSN) - min: MIN(SPI_RA_BULKY_CU_FULL_CSN) - max: MAX(SPI_RA_BULKY_CU_FULL_CSN) - unit: CU - tips: - Reach CU Threadgroups Limit: - avg: AVG(SPI_RA_TGLIM_CU_FULL_CSN) - min: MIN(SPI_RA_TGLIM_CU_FULL_CSN) - max: MAX(SPI_RA_TGLIM_CU_FULL_CSN) - unit: Cycles - tips: - Reach CU Wave Limit: - avg: 
AVG(SPI_RA_WVLIM_STALL_CSN) - min: MIN(SPI_RA_WVLIM_STALL_CSN) - max: MAX(SPI_RA_WVLIM_STALL_CSN) - unit: Cycles + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: - VGPR Writes: - avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: diff --git a/src/omniperf_analyze/configs/gfx908/0700_wavefront-launch.yaml b/src/omniperf_analyze/configs/gfx908/0700_wavefront-launch.yaml index 70141193e..abcaae418 100644 --- a/src/omniperf_analyze/configs/gfx908/0700_wavefront-launch.yaml +++ b/src/omniperf_analyze/configs/gfx908/0700_wavefront-launch.yaml @@ -77,7 +77,7 @@ Panel Config: avg: AVG(scr) min: MIN(scr) max: MAX(scr) - unit: Bytes + unit: Bytes/Workitem tips: - metric_table: @@ -103,7 +103,7 @@ Panel Config: max: MAX(GRBM_GUI_ACTIVE) unit: Cycle tips: - Instr/wavefront: + Instructions per wavefront: avg: AVG((SQ_INSTS / SQ_WAVES)) min: MIN((SQ_INSTS / SQ_WAVES)) max: MAX((SQ_INSTS / SQ_WAVES)) diff --git a/src/omniperf_analyze/configs/gfx908/1000_compute-unit-instruction-mix.yaml b/src/omniperf_analyze/configs/gfx908/1000_compute-unit-instruction-mix.yaml index 9df6750f6..9aac87117 100644 --- a/src/omniperf_analyze/configs/gfx908/1000_compute-unit-instruction-mix.yaml +++ b/src/omniperf_analyze/configs/gfx908/1000_compute-unit-instruction-mix.yaml @@ -10,7 +10,7 @@ Panel Config: data source: - metric_table: id: 1001 - title: Instruction Mix + title: Overall Instruction Mix header: metric: Metric avg: Avg @@ -22,7 +22,7 @@ Panel Config: type: simple_bar label_txt: (# of instr + $normUnit) metric: - VALU - Vector: + VALU: avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) @@ -40,7 +40,7 @@ Panel Config: max: MAX((SQ_INSTS_LDS / $denom)) unit: (instr + $normUnit) tips: - VALU - MFMA: + MFMA: avg: None # No HW module min: None # No HW module max: None # No HW module @@ -64,12 +64,6 @@ Panel Config: max: MAX((SQ_INSTS_BRANCH / $denom)) unit: (instr + $normUnit) tips: - GDS: - avg: AVG((SQ_INSTS_GDS / $denom)) - min: 
MIN((SQ_INSTS_GDS / $denom)) - max: MAX((SQ_INSTS_GDS / $denom)) - unit: (instr + $normUnit) - tips: - metric_table: id: 1002 @@ -103,7 +97,7 @@ Panel Config: max: None # No perf counter unit: (instr + $normUnit) tips: - F16-Mult: + F16-MUL: avg: None # No perf counter min: None # No perf counter max: None # No perf counter @@ -127,7 +121,7 @@ Panel Config: max: None # No perf counter unit: (instr + $normUnit) tips: - F32-Mult: + F32-MUL: avg: None # No perf counter min: None # No perf counter max: None # No perf counter @@ -151,7 +145,7 @@ Panel Config: max: None # No perf counter unit: (instr + $normUnit) tips: - F64-Mult: + F64-MUL: avg: None # No perf counter min: None # No perf counter max: None # No perf counter @@ -180,62 +174,65 @@ Panel Config: id: 1003 title: VMEM Instr Mix header: - type: Type - count: Count - tips: Tips - metric: - Buffer Instr: - count: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - tips: - Buffer Read: - count: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - tips: - Buffer Write: - count: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - tips: - Buffer Atomic: - count: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - tips: - Flat Instr: - count: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - tips: - Flat Read: - count: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - tips: - Flat Write: - count: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - tips: - Flat Atomic: - count: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - tips: - - - metric_table: - id: 1004 - title: MFMA Arithmetic Instr Mix - header: - type: Type - count: Count + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit tips: Tips metric: - MFMA-I8: - count: None # No HW module + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) tips: - MFMA-F16: - count: None # No HW module + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) tips: - MFMA-BF16: - count: None # No HW module + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) tips: - MFMA-F32: - count: None # No HW module + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) tips: - MFMA-F64: - count: None # No HW module + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Read: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) tips: - 
metric_table: - id: 1104 - title: Memory Latencies + id: 1004 + title: MFMA Arithmetic Instr Mix header: metric: Metric avg: Avg @@ -244,27 +241,33 @@ Panel Config: unit: Unit tips: Tips metric: - VMEM Latency: - avg: None # No perf counter - min: None # No perf counter - max: None # No perf counter - unit: Cycles - tips: SQ_INSTS_LEVEL_VMEM - SMEM Latency: - avg: None # No perf counter - min: None # No perf counter - max: None # No perf counter - unit: Cycles - tips: SQ_INSTS_LEVEL_SMEM - Instr Fetch Latency: - avg: None # No perf counter - min: None # No perf counter - max: None # No perf counter - unit: Cycles - tips: SQ_IFETCH_LEVEL - LDS Latency: - avg: None # No perf counter - min: None # No perf counter - max: None # No perf counter - unit: Cycles - tips: SQ_INST_LEVEL_LDS + MFMA-I8: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) + tips: + MFMA-F16: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) + tips: + MFMA-BF16: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) + tips: + MFMA-F32: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) + tips: + MFMA-F64: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: (instr + $normUnit) + tips: diff --git a/src/omniperf_analyze/configs/gfx908/1100_compute-unit-compute-pipeline.yaml b/src/omniperf_analyze/configs/gfx908/1100_compute-unit-compute-pipeline.yaml index 061311d62..8dfcef927 100644 --- a/src/omniperf_analyze/configs/gfx908/1100_compute-unit-compute-pipeline.yaml +++ b/src/omniperf_analyze/configs/gfx908/1100_compute-unit-compute-pipeline.yaml @@ -13,7 +13,10 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak tips: Tips style: type: simple_bar @@ -21,23 +24,47 @@ Panel Config: label_txt: (%) xrange: [0, 110] metric: - valu_flops_pop: + VALU FLOPs: + value: None # No perf counter + Unit: None + peak: None + pop: None + tips: + VALU IOPs: value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_bf16_pop: + MFMA FLOPs (BF16): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_f16_pop: + MFMA FLOPs (F16): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_f32_pop: + MFMA FLOPs (F32): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_f64_pop: + MFMA FLOPs (F64): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - mfma_flops_i8_pop: + MFMA IOPs (INT8): value: None # No perf counter + Unit: None + peak: None + pop: None tips: - metric_table: @@ -51,36 +78,48 @@ Panel Config: unit: Unit tips: Tips metric: - IPC (Avg): + IPC: avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) unit: Instr/cycle tips: - IPC (Issue): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + IPC (Issued): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) - min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + 
SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) unit: Instr/cycle tips: - SALU Util: + SALU Utilization: avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) min: MIN((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) max: MAX((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) unit: pct tips: - VALU Util: + VALU Utilization: avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) min: MIN((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) max: MAX((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) unit: pct tips: + VMEM Utilization: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: pct + tips: + Branch Utilization: + avg: None # No HW module + min: None # No HW module + max: None # No HW module + unit: pct + tips: VALU Active Threads: avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) @@ -90,7 +129,7 @@ Panel Config: != 0) else None)) unit: Threads tips: - MFMA Util: + MFMA Utilization: avg: None # No HW module min: None # No HW module max: None # No HW module @@ -102,6 +141,20 @@ Panel Config: max: None # No HW module unit: cycles/instr tips: + VMEM Latency: + avg: None # No perf counter + min: None # No perf counter + max: None # No perf counter + unit: Cycles + coll_level: SQ_INST_LEVEL_VMEM + tips: + SMEM Latency: + avg: None # No perf counter + min: None # No perf counter + max: None # No perf counter + unit: Cycles + coll_level: SQ_INST_LEVEL_SMEM + tips: - metric_table: id: 1103 @@ -121,7 +174,7 @@ Panel Config: max: None # No perf counter unit: (OPs + $normUnit) tips: - INT8 OPs: + IOPs (Total): avg: None # No perf counter min: None # No perf counter max: None # No perf counter @@ -151,5 +204,11 @@ Panel Config: max: None # No perf counter unit: (OPs + $normUnit) tips: + INT8 OPs: + avg: None # No perf counter + min: None # No perf counter + max: None # No perf counter + unit: (OPs + $normUnit) + tips: diff --git a/src/omniperf_analyze/configs/gfx908/1200_lds.yaml b/src/omniperf_analyze/configs/gfx908/1200_lds.yaml index 3fd52c3b1..1fda7461d 100644 --- a/src/omniperf_analyze/configs/gfx908/1200_lds.yaml +++ b/src/omniperf_analyze/configs/gfx908/1200_lds.yaml @@ -30,11 +30,13 @@ Panel Config: value: AVG(((200 * SQ_ACTIVE_INST_LDS) / (GRBM_GUI_ACTIVE * $numCU))) unit: Pct of Peak tips: - Bandwidth (Pct-of-Peak): + unit: pct + Theoretical Bandwidth: value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs)) / (($sclk * $numCU) * 0.00128))) unit: Pct of Peak tips: + unit: pct Bank Conflict Rate: value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) @@ -58,7 +60,7 @@ Panel Config: max: MAX((SQ_INSTS_LDS / $denom)) unit: (Instr + $normUnit) tips: - Bandwidth: + Theoretical Bandwidth: avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * 
TO_INT($LDSBanks)) / $denom)) min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) @@ -67,7 +69,14 @@ Panel Config: / $denom)) unit: (Bytes + $normUnit) tips: - Bank Conficts/Access: + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + tips: + Bank Conflicts/Access: avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) @@ -82,7 +91,7 @@ Panel Config: max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) unit: (Cycles + $normUnit) tips: - Atomic Cycles: + Atomic Return Cycles: avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) @@ -110,12 +119,5 @@ Panel Config: avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) - unit: ( + $normUnit) - tips: - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_LDS - tips: + unit: (Accesses + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx908/1300_instruction-cache.yaml b/src/omniperf_analyze/configs/gfx908/1300_instruction-cache.yaml index 05dc75980..555bc714a 100644 --- a/src/omniperf_analyze/configs/gfx908/1300_instruction-cache.yaml +++ b/src/omniperf_analyze/configs/gfx908/1300_instruction-cache.yaml @@ -13,7 +13,7 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: @@ -27,11 +27,16 @@ Panel Config: * (EndNs - BeginNs)))) unit: Pct of Peak tips: - Cache Hit: + Cache Hit Rate: value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + SQC_ICACHE_MISSES_DUPLICATE))) unit: Pct of Peak tips: + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($sclk * $numSQC) + * (EndNs - BeginNs)))) + unit: Pct of Peak + tips: - metric_table: id: 1302 @@ -68,7 +73,7 @@ Panel Config: max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) unit: (Misses + $normUnit) tips: - Cache Hit: + Cache Hit Rate: avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + SQC_ICACHE_MISSES_DUPLICATE))) min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + @@ -77,3 +82,27 @@ Panel Config: SQC_ICACHE_MISSES_DUPLICATE))) unit: pct tips: + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + tips: + - metric_table: + id: 1303 + title: Instruction Cache - L2 Interface + header: + metric: Metric + mean: Mean + min: Min + max: Max + unit: Unit + tips: Tips + metric: + L1I-L2 Bandwidth: + mean: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) + tips: \ No newline at end of file diff --git 
a/src/omniperf_analyze/configs/gfx908/1400_constant-cache.yaml b/src/omniperf_analyze/configs/gfx908/1400_constant-cache.yaml index 563caad13..aa55fee0c 100644 --- a/src/omniperf_analyze/configs/gfx908/1400_constant-cache.yaml +++ b/src/omniperf_analyze/configs/gfx908/1400_constant-cache.yaml @@ -12,8 +12,8 @@ Panel Config: id: 1401 title: Speed-of-Light header: - mertic: Metric - value: Value + metric: Metric + value: Avg unit: Unit tips: Tips style: @@ -27,12 +27,17 @@ Panel Config: * (EndNs - BeginNs)))) unit: Pct of Peak tips: - Cache Hit: + Cache Hit Rate: value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: Pct of Peak tips: + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) + / (2 * ($sclk * $numSQC) * (EndNs - BeginNs))) + unit: Pct of Peak + tips: - metric_table: id: 1402 @@ -69,7 +74,7 @@ Panel Config: max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) unit: (Req + $normUnit) tips: - Cache Hit: + Cache Hit Rate: avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) @@ -138,6 +143,12 @@ Panel Config: unit: Unit tips: Tips metric: + sL1D-L2 BW: + mean: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + unit: (Bytes + $normUnit) + tips: Read Req: avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) min: MIN((SQC_TC_DATA_READ_REQ / $denom)) @@ -156,7 +167,7 @@ Panel Config: max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) unit: (Req + $normUnit) tips: - Stall: + Stall Cycles: avg: AVG((SQC_TC_STALL / $denom)) min: MIN((SQC_TC_STALL / $denom)) max: MAX((SQC_TC_STALL / $denom)) diff --git a/src/omniperf_analyze/configs/gfx908/1500_TA_and_TD.yaml b/src/omniperf_analyze/configs/gfx908/1500_TA_and_TD.yaml index 8f71cedc9..773bb7c76 100644 --- a/src/omniperf_analyze/configs/gfx908/1500_TA_and_TD.yaml +++ b/src/omniperf_analyze/configs/gfx908/1500_TA_and_TD.yaml @@ -6,11 +6,11 @@ Metric Description: # Define the panel properties and properties of each metric in the panel. 
Panel Config: id: 1500 - title: Texture Addresser and Texture Data (TA/TD) + title: Address Processing Unit and Data Return Path (TA/TD) data source: - metric_table: id: 1501 - title: TA + title: Address Processing Unit header: metric: Metric avg: Avg @@ -19,25 +19,25 @@ Panel Config: unit: Unit tips: Tips metric: - TA Busy: + Address Processing Unit Busy: avg: AVG(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TA Addr Stall: + Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TA Data Stall: + Data Stall: avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TD2TA Addr Stall: + Data-Processor → Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) @@ -47,69 +47,69 @@ Panel Config: avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Instr: + Global/Generic Instructions: avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Read Instr: + Global/Generic Read Instructions: avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Write Instr: + Global/Generic Write Instructions: avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Atomic Instr: + Global/Generic Atomic Instructions: avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Instr: + Spill/Stack Instructions: avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Read Instr: + Spill/Stack Read Instructions: avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Write Instr: + Spill/Stack Write Instructions: avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: 
(Instructions + $normUnit) tips: - Buffer Atomic Instr: + Spill/Stack Atomic Instructions: avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Total Cylces: + Spill/Stack Total Cycles: avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) tips: - Buffer Coalesced Read: + Spill/Stack Coalesced Read: avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) tips: - Buffer Coalesced Write: + Spill/Stack Coalesced Write: avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) @@ -118,7 +118,7 @@ Panel Config: - metric_table: id: 1502 - title: TD + title: Data-Return Path header: metric: Metric avg: Avg @@ -127,48 +127,48 @@ Panel Config: unit: Unit tips: Tips metric: - TD Busy: + Data-Return Busy: avg: AVG(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TD Stall: + Cache RAM → Data-Return Stall: avg: AVG(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - SPI2TD Stall: + Workgroup manager → Data-Return Stall: avg: # No perf counter min: # No perf counter max: # No perf counter unit: pct tips: - Coalescable Instr: + Coalescable Instructions: avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Load Instr: + Read Instructions: avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Store Instr: + Write Instructions: avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Atomic Instr: + Atomic Instructions: avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: diff --git a/src/omniperf_analyze/configs/gfx908/1600_L1_cache.yaml b/src/omniperf_analyze/configs/gfx908/1600_L1_cache.yaml index cac92b1f2..db6b688ab 100644 --- a/src/omniperf_analyze/configs/gfx908/1600_L1_cache.yaml +++ b/src/omniperf_analyze/configs/gfx908/1600_L1_cache.yaml @@ -13,7 +13,7 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: @@ -22,26 +22,26 @@ Panel Config: label_txt: (%) xrange: [0, 110] metric: - Buffer Coalescing: - value: 
AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - tips: - Cache Util: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) unit: Pct of Peak tips: - Cache BW: + Bandwidth: value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs)))) / ((($sclk / 1000) * 64) * $numCU)) unit: Pct of Peak tips: - Cache Hit: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + tips: + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) unit: Pct of Peak tips: @@ -141,11 +141,26 @@ Panel Config: unit: (Req + $normUnit) tips: Cache BW: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - unit: GB/s + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + unit: (Bytes + $normUnit) tips: + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + unit: pct + tips: Cache Accesses: avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) @@ -164,22 +179,7 @@ Panel Config: / $denom)) unit: (Req + $normUnit) tips: - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: pct - tips: - Invalidate: + Invalidations: avg: 
AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) @@ -188,9 +188,9 @@ Panel Config: L1-L2 BW: avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - min: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - max: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) unit: (Bytes + $normUnit) tips: @@ -388,17 +388,17 @@ Panel Config: avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Hits + $normUnit) + units: (Req + $normUnit) tips: - Misses (Translation): + Translation Misses: avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Misses + $normUnit) + units: (Req + $normUnit) tips: - Misses (Permission): + Permission Misses: avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Misses + $normUnit) + units: (Req + $normUnit) tips: diff --git a/src/omniperf_analyze/configs/gfx908/1700_L2_cache.yaml b/src/omniperf_analyze/configs/gfx908/1700_L2_cache.yaml index 0c7b03811..cf782e193 100644 --- a/src/omniperf_analyze/configs/gfx908/1700_L2_cache.yaml +++ b/src/omniperf_analyze/configs/gfx908/1700_L2_cache.yaml @@ -13,31 +13,35 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: type: simple_bar metric: - L2 Util: + Utilization: value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($L2Banks) * GRBM_GUI_ACTIVE))) unit: pct + tips: + Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 64) / (EndNs - BeginNs)))) / ((($sclk / 1000) * 64) * TO_INT($L2Banks))) + unit: pct tips: - Cache Hit: + Hit Rate: value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)) unit: pct - tips: - L2-EA Rd BW: + tips: + L2-Fabric Read BW: value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / (EndNs - BeginNs))) unit: GB/s - tips: - L2-EA Wr BW: + tips: + L2-Fabric Write and Atomic BW: value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / (EndNs - BeginNs))) unit: GB/s - tips: + tips: - metric_table: id: 1702 @@ -50,7 +54,7 @@ Panel Config: unit: Unit tips: Tips metric: - Read BW: + L2-Fabric Read BW: avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / $denom)) min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) @@ -58,8 +62,26 @@ Panel Config: max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / $denom)) unit: (Bytes + $normUnit) - tips: - Write BW: + tips: + HBM Read Traffic: + avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * 
(TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + Remote Read Traffic: + avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + L2-Fabric Write and Atomic BW: avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / $denom)) min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) @@ -67,55 +89,31 @@ Panel Config: max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / $denom)) unit: (Bytes + $normUnit) - tips: - Read (32B): - avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (Uncached 32B): - avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (64B): - avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - HBM Read: - avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write (32B): - avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - Write (Uncached 32B): - avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write (64B): - avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Write: - avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: + tips: + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Remote Write and Atomic Traffic: + avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_WRREQ_sum - 
TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Atomic Traffic: + avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: Read Latency: avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) @@ -124,7 +122,7 @@ Panel Config: max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) unit: Cycles - tips: + tips: Write Latency: avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) @@ -133,7 +131,7 @@ Panel Config: max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) unit: Cycles - tips: + tips: Atomic Latency: avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum != 0) else None)) @@ -142,7 +140,7 @@ Panel Config: max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum != 0) else None)) unit: Cycles - tips: + tips: Read Stall: avg: AVG((((100 * ((TCC_EA_RDREQ_IO_CREDIT_STALL_sum + TCC_EA_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != @@ -154,7 +152,7 @@ Panel Config: + TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) unit: pct - tips: + tips: Write Stall: avg: AVG((((100 * ((TCC_EA_WRREQ_IO_CREDIT_STALL_sum + TCC_EA_WRREQ_GMI_CREDIT_STALL_sum) + TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != @@ -166,7 +164,7 @@ Panel Config: + TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) unit: pct - tips: + tips: - metric_table: id: 1703 @@ -179,121 +177,127 @@ Panel Config: unit: Unit tips: Tips metric: + Bandwidth: + avg: AVG((TCC_REQ_sum * 64) / $denom) + min: MIN((TCC_REQ_sum * 64) / $denom) + max: MAX((TCC_REQ_sum * 64) / $denom) + unit: (Bytes + $normUnit) + tips: Req: avg: AVG((TCC_REQ_sum / $denom)) min: MIN((TCC_REQ_sum / $denom)) max: MAX((TCC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: + tips: Read Req: avg: AVG((TCC_READ_sum / $denom)) min: MIN((TCC_READ_sum / $denom)) max: MAX((TCC_READ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: Write Req: avg: AVG((TCC_WRITE_sum / $denom)) min: MIN((TCC_WRITE_sum / $denom)) max: MAX((TCC_WRITE_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: Atomic Req: avg: AVG((TCC_ATOMIC_sum / $denom)) min: MIN((TCC_ATOMIC_sum / $denom)) max: MAX((TCC_ATOMIC_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: Probe Req: avg: AVG((TCC_PROBE_sum / $denom)) min: MIN((TCC_PROBE_sum / $denom)) max: MAX((TCC_PROBE_sum / $denom)) 
unit: (Req + $normUnit) - tips: + tips: + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + tips: Hits: avg: AVG((TCC_HIT_sum / $denom)) min: MIN((TCC_HIT_sum / $denom)) max: MAX((TCC_HIT_sum / $denom)) unit: (Hits + $normUnit) - tips: + tips: Misses: avg: AVG((TCC_MISS_sum / $denom)) min: MIN((TCC_MISS_sum / $denom)) max: MAX((TCC_MISS_sum / $denom)) unit: (Misses + $normUnit) - tips: - Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - tips: + tips: Writeback: avg: AVG((TCC_WRITEBACK_sum / $denom)) min: MIN((TCC_WRITEBACK_sum / $denom)) max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: + unit: (Cachelines + $normUnit) + tips: + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Evict (Normal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: NC Req: avg: AVG((TCC_NC_REQ_sum / $denom)) min: MIN((TCC_NC_REQ_sum / $denom)) max: MAX((TCC_NC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: UC Req: avg: AVG((TCC_UC_REQ_sum / $denom)) min: MIN((TCC_UC_REQ_sum / $denom)) max: MAX((TCC_UC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: CC Req: avg: AVG((TCC_CC_REQ_sum / $denom)) min: MIN((TCC_CC_REQ_sum / $denom)) max: MAX((TCC_CC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: RW Req: avg: AVG((TCC_RW_REQ_sum / $denom)) min: MIN((TCC_RW_REQ_sum / $denom)) max: MAX((TCC_RW_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: - Writeback (Normal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: - Writeback (TC Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: - Evict (Normal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: ( + $normUnit) - tips: - 
Evict (TC Req): - avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: ( + $normUnit) - tips: + tips: - metric_table: id: 1704 - title: L2 - EA Interface Stalls + title: L2 - Fabric Interface Stalls header: metric: Metric type: Type @@ -306,59 +310,137 @@ Panel Config: style: type: simple_multi_bar metric: - Read - Remote Socket Stall: - type: Remote Socket Stall + Read - PCIe Stall: + type: PCIe Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read - Peer GCD Stall: - type: Peer GCD Stall + avg: AVG(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Read - Infinity Fabric™ Stall: + type: Infinity Fabric™ Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Read - HBM Stall: type: HBM Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write - Remote Socket Stall: - type: Remote Socket Stall + avg: AVG(((100 * (TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Write - PCIe Stall: + type: PCIe Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write - Peer GCD Stall: - type: Peer GCD Stall + avg: AVG(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Write - Infinity Fabric™ Stall: + type: Infinity Fabric™ Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: 
MAX(((100 * (TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Write - HBM Stall: - type: HBM Stall + type: HBM Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Write - Credit Starvation: type: Credit Starvation transaction: Write - avg: AVG((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) - min: MIN((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) - max: MAX((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + + - metric_table: + id: 1705 + title: L2 - Fabric Detailed Transaction Breakdown + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Read (32B): + avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: + Read (Uncached): + avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Read (64B): + avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + unit: (Req + $normUnit) + tips: + HBM Read: + avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + tips: + Remote Read: + avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (32B): + avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (Uncached): + avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (64B): + avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + tips: + HBM Write and Atomic: + avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + tips: + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + min: 
MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + tips: + Atomic: + avg: AVG((TCC_EA_ATOMIC_sum / $denom)) + min: MIN((TCC_EA_ATOMIC_sum / $denom)) + max: MAX((TCC_EA_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx908/1800_L2_cache_per_channel.yaml b/src/omniperf_analyze/configs/gfx908/1800_L2_cache_per_channel.yaml index 45f8abb41..54bf67dfc 100644 --- a/src/omniperf_analyze/configs/gfx908/1800_L2_cache_per_channel.yaml +++ b/src/omniperf_analyze/configs/gfx908/1800_L2_cache_per_channel.yaml @@ -167,7 +167,7 @@ Panel Config: + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) unit: pct tips: - Req: + L2 Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_REQ[0]) + TO_INT(TCC_REQ[1])) + TO_INT(TCC_REQ[2])) + TO_INT(TCC_REQ[3])) + TO_INT(TCC_REQ[4])) + TO_INT(TCC_REQ[5])) + TO_INT(TCC_REQ[6])) + TO_INT(TCC_REQ[7])) + TO_INT(TCC_REQ[8])) + TO_INT(TCC_REQ[9])) @@ -206,7 +206,7 @@ Panel Config: + TO_INT(TCC_REQ[30])) + TO_INT(TCC_REQ[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Read Req: + L2 Read Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_READ[0]) + TO_INT(TCC_READ[1])) + TO_INT(TCC_READ[2])) + TO_INT(TCC_READ[3])) + TO_INT(TCC_READ[4])) + TO_INT(TCC_READ[5])) + TO_INT(TCC_READ[6])) + TO_INT(TCC_READ[7])) + TO_INT(TCC_READ[8])) + TO_INT(TCC_READ[9])) @@ -249,7 +249,7 @@ Panel Config: + TO_INT(TCC_READ[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Write Req: + L2 Write Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_WRITE[0]) + TO_INT(TCC_WRITE[1])) + TO_INT(TCC_WRITE[2])) + TO_INT(TCC_WRITE[3])) + TO_INT(TCC_WRITE[4])) + TO_INT(TCC_WRITE[5])) + TO_INT(TCC_WRITE[6])) + TO_INT(TCC_WRITE[7])) + TO_INT(TCC_WRITE[8])) @@ -296,7 +296,7 @@ Panel Config: + TO_INT(TCC_WRITE[30])) + TO_INT(TCC_WRITE[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Atomic Req: + L2 Atomic Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_ATOMIC[0]) + TO_INT(TCC_ATOMIC[1])) + TO_INT(TCC_ATOMIC[2])) + TO_INT(TCC_ATOMIC[3])) + TO_INT(TCC_ATOMIC[4])) + TO_INT(TCC_ATOMIC[5])) + TO_INT(TCC_ATOMIC[6])) + TO_INT(TCC_ATOMIC[7])) @@ -347,7 +347,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Read Req: + L2 - Fabric Read Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ[0]) + TO_INT(TCC_EA_RDREQ[1])) + TO_INT(TCC_EA_RDREQ[2])) + TO_INT(TCC_EA_RDREQ[3])) + TO_INT(TCC_EA_RDREQ[4])) + TO_INT(TCC_EA_RDREQ[5])) + TO_INT(TCC_EA_RDREQ[6])) + TO_INT(TCC_EA_RDREQ[7])) @@ -398,7 +398,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Write Req: + L2 - Fabric Write and Atomic Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ[0]) + TO_INT(TCC_EA_WRREQ[1])) + TO_INT(TCC_EA_WRREQ[2])) + TO_INT(TCC_EA_WRREQ[3])) + TO_INT(TCC_EA_WRREQ[4])) + TO_INT(TCC_EA_WRREQ[5])) + TO_INT(TCC_EA_WRREQ[6])) + TO_INT(TCC_EA_WRREQ[7])) @@ -449,7 +449,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Atomic Req: + L2 - Fabric Atomic Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_ATOMIC[0]) + TO_INT(TCC_EA_ATOMIC[1])) + TO_INT(TCC_EA_ATOMIC[2])) + TO_INT(TCC_EA_ATOMIC[3])) + TO_INT(TCC_EA_ATOMIC[4])) + TO_INT(TCC_EA_ATOMIC[5])) + TO_INT(TCC_EA_ATOMIC[6])) + TO_INT(TCC_EA_ATOMIC[7])) @@ -500,7 +500,7 @@ Panel Config: / 32) / $denom)) unit: (Req 
+ $normUnit) tips: - L2 - EA Read Lat: + L2 - Fabric Read Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_RDREQ_LEVEL[0] + TCC_EA_RDREQ_LEVEL[1]) + TCC_EA_RDREQ_LEVEL[2]) + TCC_EA_RDREQ_LEVEL[3]) + TCC_EA_RDREQ_LEVEL[4]) + TCC_EA_RDREQ_LEVEL[5]) + TCC_EA_RDREQ_LEVEL[6]) + TCC_EA_RDREQ_LEVEL[7]) @@ -615,7 +615,7 @@ Panel Config: + TCC_EA_RDREQ[29]) + TCC_EA_RDREQ[30]) + TCC_EA_RDREQ[31]) != 0) else None)) unit: Cycles tips: - L2 - EA Write Lat: + L2 - Fabric Write Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_WRREQ_LEVEL[0] + TCC_EA_WRREQ_LEVEL[1]) + TCC_EA_WRREQ_LEVEL[2]) + TCC_EA_WRREQ_LEVEL[3]) + TCC_EA_WRREQ_LEVEL[4]) + TCC_EA_WRREQ_LEVEL[5]) + TCC_EA_WRREQ_LEVEL[6]) + TCC_EA_WRREQ_LEVEL[7]) @@ -730,7 +730,7 @@ Panel Config: + TCC_EA_WRREQ[29]) + TCC_EA_WRREQ[30]) + TCC_EA_WRREQ[31]) != 0) else None)) unit: Cycles tips: - L2 - EA Atomic Lat: + L2 - Fabric Atomic Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_ATOMIC_LEVEL[0] + TCC_EA_ATOMIC_LEVEL[1]) + TCC_EA_ATOMIC_LEVEL[2]) + TCC_EA_ATOMIC_LEVEL[3]) + TCC_EA_ATOMIC_LEVEL[4]) + TCC_EA_ATOMIC_LEVEL[5]) + TCC_EA_ATOMIC_LEVEL[6]) + TCC_EA_ATOMIC_LEVEL[7]) @@ -849,7 +849,7 @@ Panel Config: None)) unit: Cycles tips: - L2 - EA Read Stall (IO): + L2 - Fabric Read Stall (PCIe): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[0]) + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[1])) + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[2])) + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[3])) + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[4])) @@ -920,7 +920,7 @@ Panel Config: + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Read Stall (GMI): + L2 - Fabric Read Stall (Infinity Fabric™): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[0]) + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[1])) + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[2])) + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[3])) + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[4])) @@ -991,7 +991,7 @@ Panel Config: + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Read Stall (DRAM): + L2 - Fabric Read Stall (HBM): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[0]) + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[1])) + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[2])) + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[3])) + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[4])) @@ -1062,7 +1062,7 @@ Panel Config: + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (IO): + L2 - Fabric Write Stall (PCIe): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[0]) + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[1])) + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[2])) + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[3])) + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[4])) @@ -1133,7 +1133,7 @@ Panel Config: + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (GMI): + L2 - Fabric Write Stall (Infinity Fabric™): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[0]) + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[1])) + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[2])) + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[3])) + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[4])) @@ -1204,7 +1204,7 @@ Panel Config: + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (DRAM): + L2 - Fabric Write Stall (HBM): avg: 
AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[0]) + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[1])) + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[2])) + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[3])) + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[4])) @@ -1275,7 +1275,7 @@ Panel Config: + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Write Starve: + L2 - Fabric Write Starve: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[0]) + TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[1])) + TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[2])) + TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[3])) + TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[4])) @@ -1354,22 +1354,22 @@ Panel Config: channel: Channel hit rate: L2 Cache Hit Rate (%) req: Requests (Requests) - read req: L1-L2 Read (Requests) - write req: L1-L2 Write (Requests) - atomic req: L1-L2 Atomic (Requests) - ea read req: L2-EA Read (Requests) - ea write req: L2-EA Write (Requests) - ea atomic req: L2-EA Atomic (Requests) - ea read lat - cycles: L2-EA Read Latency (Cycles) - ea write lat - cycles: L2-EA Write Latency (Cycles) - ea atomic lat - cycles: L2-EA Atomic Latency (Cycles) - ea read stall - io: L2-EA Read Stall - IO (Cycles per) - ea read stall - gmi: L2-EA Read Stall - GMI (Cycles per) - ea read stall - dram: L2-EA Read Stall - DRAM (Cycles per) - ea write stall - io: L2-EA Write Stall - IO (Cycles per) - ea write stall - gmi: L2-EA Write Stall - GMI (Cycles per) - ea write stall - dram: L2-EA Write Stall - DRAM (Cycles per) - ea write stall - starve: L2-EA Write Stall - Starve (Cycles per) + read req: L2 Read (Requests) + write req: L2 Write (Requests) + atomic req: L2 Atomic (Requests) + ea read req: L2-Fabric Read (Requests) + ea write req: L2-Fabric Write and Atomic (Requests) + ea atomic req: L2-Fabric Atomic (Requests) + ea read lat - cycles: L2-Fabric Read Latency (Cycles) + ea write lat - cycles: L2-Fabric Write Latency (Cycles) + ea atomic lat - cycles: L2-Fabric Atomic Latency (Cycles) + ea read stall - io: L2-Fabric Read Stall - PCIe (Cycles) + ea read stall - gmi: L2-Fabric Read Stall - Infinity Fabric™ (Cycles) + ea read stall - dram: L2-Fabric Read Stall - HBM (Cycles) + ea write stall - io: L2-Fabric Write Stall - PCIe (Cycles) + ea write stall - gmi: L2-Fabric Write Stall - Infinity Fabric™ (Cycles) + ea write stall - dram: L2-Fabric Write Stall - HBM (Cycles) + ea write stall - starve: L2-Fabric Write Stall - Starve (Cycles) tips: Tips metric: "0": @@ -1829,22 +1829,22 @@ Panel Config: channel: Channel hit rate: L2 Cache Hit Rate (%) req: Requests (Requests) - read req: L1-L2 Read (Requests) - write req: L1-L2 Write (Requests) - atomic req: L1-L2 Atomic (Requests) - ea read req: L2-EA Read (Requests) - ea write req: L2-EA Write (Requests) - ea atomic req: L2-EA Atomic (Requests) - ea read lat - cycles: L2-EA Read Latency (Cycles) - ea write lat - cycles: L2-EA Write Latency (Cycles) - ea atomic lat - cycles: L2-EA Atomic Latency (Cycles) - ea read stall - io: L2-EA Read Stall - IO (Cycles per) - ea read stall - gmi: L2-EA Read Stall - GMI (Cycles per) - ea read stall - dram: L2-EA Read Stall - DRAM (Cycles per) - ea write stall - io: L2-EA Write Stall - IO (Cycles per) - ea write stall - gmi: L2-EA Write Stall - GMI (Cycles per) - ea write stall - dram: L2-EA Write Stall - DRAM (Cycles per) - ea write stall - starve: L2-EA Write Stall - Starve (Cycles per) + read req: L2 Read (Requests) + write req: L2 Write (Requests) + atomic req: L2 Atomic (Requests) + ea read req: 
L2-Fabric Read (Requests) + ea write req: L2-Fabric Write and Atomic (Requests) + ea atomic req: L2-Fabric Atomic (Requests) + ea read lat - cycles: L2-Fabric Read Latency (Cycles) + ea write lat - cycles: L2-Fabric Write Latency (Cycles) + ea atomic lat - cycles: L2-Fabric Atomic Latency (Cycles) + ea read stall - io: L2-Fabric Read Stall - PCIe (Cycles) + ea read stall - gmi: L2-Fabric Read Stall - Infinity Fabric™ (Cycles) + ea read stall - dram: L2-Fabric Read Stall - HBM (Cycles) + ea write stall - io: L2-Fabric Write Stall - PCIe (Cycles) + ea write stall - gmi: L2-Fabric Write Stall - Infinity Fabric™ (Cycles) + ea write stall - dram: L2-Fabric Write Stall - HBM (Cycles) + ea write stall - starve: L2-Fabric Write Stall - Starve (Cycles) tips: Tips metric: "16": diff --git a/src/omniperf_analyze/configs/gfx90a/0200_system-speed-of-light.yaml b/src/omniperf_analyze/configs/gfx90a/0200_system-speed-of-light.yaml index c197c0fc5..4f27676a2 100644 --- a/src/omniperf_analyze/configs/gfx90a/0200_system-speed-of-light.yaml +++ b/src/omniperf_analyze/configs/gfx90a/0200_system-speed-of-light.yaml @@ -14,10 +14,10 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit peak: Peak - pop: PoP + pop: Pct of Peak tips: Tips metric: VALU FLOPs: @@ -83,19 +83,19 @@ Panel Config: peak: $numCU pop: ((100 * $numActiveCUs) / $numCU) tips: - SALU Util: + SALU Utilization: value: AVG(((100 * SQ_ACTIVE_INST_SCA) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct peak: 100 pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / (GRBM_GUI_ACTIVE * $numCU))) tips: - VALU Util: + VALU Utilization: value: AVG(((100 * SQ_ACTIVE_INST_VALU) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct peak: 100 pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / (GRBM_GUI_ACTIVE * $numCU))) tips: - MFMA Util: + MFMA Utilization: value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((GRBM_GUI_ACTIVE * $numCU) * 4))) unit: pct @@ -103,7 +103,20 @@ Panel Config: pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((GRBM_GUI_ACTIVE * $numCU) * 4))) tips: - VALU Active Threads/Wave: + VMEM Utilization: + value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / GRBM_GUI_ACTIVE) / $numCU)) + unit: pct + peak: 100 + pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / GRBM_GUI_ACTIVE) / $numCU)) + tips: + Branch Utilization: + value: AVG((((100 * SQ_ACTIVE_INST_MISC) / GRBM_GUI_ACTIVE) / $numCU)) + unit: pct + peak: 100 + pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / GRBM_GUI_ACTIVE) / $numCU)) + unit: pct + tips: + VALU Active Threads: value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) unit: Threads @@ -111,25 +124,29 @@ Panel Config: pop: (AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) * 1.5625) tips: - IPC - Issue: - value: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) - / SQ_ACTIVE_INST_ANY)) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) unit: Instr/cycle peak: 5 - pop: ((100 * AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) - / SQ_ACTIVE_INST_ANY))) / 5) + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) + tips: + Wavefront Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE)) + unit: Wavefronts + peak: ($maxWavesPerCU * $numCU) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE) / 
($maxWavesPerCU + * $numCU)))) + coll_level: SQ_LEVEL_WAVES tips: - LDS BW: + Theoretical LDS Bandwidth: value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs))) - unit: GB/sec + unit: GB/s peak: (($sclk * $numCU) * 0.128) pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs)) / (($sclk * $numCU) * 0.00128))) tips: - LDS Bank Conflict: + LDS Bank Conflicts/Access: value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) unit: Conflicts/access @@ -137,35 +154,7 @@ Panel Config: pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) tips: - Instr Cache Hit Rate: - value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - tips: - Instr Cache BW: - value: AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64)) - unit: GB/s - peak: ((($sclk / 1000) * 64) * $numSQC) - pop: ((100 * AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk - / 1000) * 64) * $numSQC)) - tips: - Scalar L1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - tips: - Scalar L1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64)) - unit: GB/s - peak: ((($sclk / 1000) * 64) * $numSQC) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk - / 1000) * 64) * $numSQC)) - tips: - Vector L1D Cache Hit Rate: + vL1D Cache Hit Rate: value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else @@ -177,7 +166,7 @@ Panel Config: TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else None)) tips: - Vector L1D Cache BW: + vL1D Cache BW: value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) unit: GB/s peak: ((($sclk / 1000) * 64) * $numCU) @@ -192,6 +181,13 @@ Panel Config: pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else None)) tips: + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 128) / (EndNs - BeginNs))) + unit: GB/s + peak: ((($sclk / 1000) * 128) * TO_INT($L2Banks)) + pop: ((100 * AVG(((TCC_REQ_sum * 128) / (EndNs - BeginNs)))) + / ((($sclk / 1000) * 128) * TO_INT($L2Banks))) + tips: L2-Fabric Read BW: value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / (EndNs - BeginNs))) @@ -212,36 +208,48 @@ Panel Config: value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None tips: L2-Fabric Write Latency: value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None tips: - Wave Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE)) - unit: Wavefronts - peak: ($maxWavesPerCU * $numCU) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / GRBM_GUI_ACTIVE) / 
($maxWavesPerCU - * $numCU)))) - coll_level: SQ_LEVEL_WAVES + sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) tips: - Instr Fetch BW: - value: AVG(((SQ_IFETCH / (EndNs - BeginNs)) * 32)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64)) unit: GB/s - peak: ((($sclk / 1000) * 32) * $numSQC) - pop: ((100 * AVG(((SQ_IFETCH / (EndNs - BeginNs)) * 32))) / ($numSQC - * (($sclk / 1000) * 32))) - coll_level: SQ_IFETCH_LEVEL + peak: ((($sclk / 1000) * 64) * $numSQC) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk + / 1000) * 64) * $numSQC)) + tips: + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + tips: + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64)) + unit: GB/s + peak: ((($sclk / 1000) * 64) * $numSQC) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (EndNs - BeginNs)) * 64))) / ((($sclk + / 1000) * 64) * $numSQC)) tips: - Instr Fetch Latency: + L1I Fetch Latency: value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) unit: Cycles - peak: '' - pop: '' + peak: None + pop: None coll_level: SQ_IFETCH_LEVEL tips: diff --git a/src/omniperf_analyze/configs/gfx90a/0500_command-processor.yaml b/src/omniperf_analyze/configs/gfx90a/0500_command-processor.yaml index d954f6162..b4a1f0b10 100644 --- a/src/omniperf_analyze/configs/gfx90a/0500_command-processor.yaml +++ b/src/omniperf_analyze/configs/gfx90a/0500_command-processor.yaml @@ -19,19 +19,7 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy Cycles: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles/Kernel - tips: - CPF Busy: - avg: AVG(CPF_CPF_STAT_BUSY) - min: MIN(CPF_CPF_STAT_BUSY) - max: MAX(CPF_CPF_STAT_BUSY) - unit: Cycles/Kernel - tips: - CPF Util: + CPF Utilization: avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) @@ -47,15 +35,9 @@ Panel Config: != 0) else None)) max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY != 0) else None)) - unit: Cycles/Kernel - tips: - L2Cache Intf Busy: - avg: AVG(CPF_CPF_TCIU_BUSY) - min: MIN(CPF_CPF_TCIU_BUSY) - max: MAX(CPF_CPF_TCIU_BUSY) - unit: Cycles/Kernel + unit: pct tips: - L2Cache Intf Util: + CPF-L2 Utilization: avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) @@ -64,7 +46,7 @@ Panel Config: if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) unit: pct tips: - L2Cache Intf Stall: + CPF-L2 Stall: avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY != 0) else None)) min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY @@ -73,16 +55,19 @@ Panel Config: != 0) else None)) unit: pct tips: - UTCL1 Stall: - avg: AVG(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - min: MIN(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - max: MAX(CPF_CMP_UTCL1_STALL_ON_TRANSLATION) - unit: 
Cycles/Kernel - tips: + CPF-UTCL1 Stall: + avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None) + unit: pct + tips: - metric_table: id: 502 - title: Command Processor Compute + title: Packet Processor header: metric: Metric avg: Avg @@ -91,19 +76,7 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy Cycles: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles - tips: - CPC Busy Cycles: - avg: AVG(CPC_CPC_STAT_BUSY) - min: MIN(CPC_CPC_STAT_BUSY) - max: MAX(CPC_CPC_STAT_BUSY) - unit: Cycles - tips: - CPC Util: + CPC Utilization: avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) @@ -112,12 +85,6 @@ Panel Config: if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) unit: pct tips: - CPC Stall Cycles: - avg: AVG(CPC_CPC_STAT_STALL) - min: MIN(CPC_CPC_STAT_STALL) - max: MAX(CPC_CPC_STAT_STALL) - unit: Cycles - tips: CPC Stall Rate: avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY != 0) else None)) @@ -127,28 +94,19 @@ Panel Config: != 0) else None)) unit: pct tips: - CPC Packet Decoding: - avg: AVG(CPC_ME1_BUSY_FOR_PACKET_DECODE) - min: MIN(CPC_ME1_BUSY_FOR_PACKET_DECODE) - max: MAX(CPC_ME1_BUSY_FOR_PACKET_DECODE) - unit: Cycles - tips: - SPI Intf Busy Cycles: - avg: AVG(CPC_ME1_DC0_SPI_BUSY) - min: MIN(CPC_ME1_DC0_SPI_BUSY) - max: MAX(CPC_ME1_DC0_SPI_BUSY) - unit: Cycles - tips: - SPI Intf Util: - avg: AVG((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) unit: pct tips: - L2Cache Intf Util: + CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) + unit: Pct + tips: + CPC-L2 Utilization: avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) @@ -157,19 +115,16 @@ Panel Config: if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) unit: pct tips: - UTCL1 Stall Cycles: - avg: AVG(CPC_UTCL1_STALL_ON_TRANSLATION) - min: MIN(CPC_UTCL1_STALL_ON_TRANSLATION) - max: MAX(CPC_UTCL1_STALL_ON_TRANSLATION) - unit: Cycles - tips: - UTCL2 Intf Busy Cycles: - avg: AVG(CPC_CPC_UTCL2IU_BUSY) - min: 
MIN(CPC_CPC_UTCL2IU_BUSY) - max: MAX(CPC_CPC_UTCL2IU_BUSY) - unit: Cycles + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: pct tips: - UTCL2 Intf Util: + CPC-UTCL2 Utilization: avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) diff --git a/src/omniperf_analyze/configs/gfx90a/0600_shader-processor-input.yaml b/src/omniperf_analyze/configs/gfx90a/0600_shader-processor-input.yaml index bab48700a..24d4036ec 100644 --- a/src/omniperf_analyze/configs/gfx90a/0600_shader-processor-input.yaml +++ b/src/omniperf_analyze/configs/gfx90a/0600_shader-processor-input.yaml @@ -6,11 +6,11 @@ Metric Description: # Define the panel properties and properties of each metric in the panel. Panel Config: id: 600 - title: Shader Processor Input (SPI) + title: Workgroup Manager (SPI) data source: - metric_table: id: 601 - title: SPI Stats + title: Workgroup Manager Utilizations header: metric: Metric avg: Avg @@ -19,29 +19,35 @@ Panel Config: unit: Unit tips: Tips metric: - GPU Busy: - avg: AVG(GRBM_GUI_ACTIVE) - min: MIN(GRBM_GUI_ACTIVE) - max: MAX(GRBM_GUI_ACTIVE) - unit: Cycles - tips: - CS Busy: - avg: AVG(SPI_CSN_BUSY) - min: MIN(SPI_CSN_BUSY) - max: MAX(SPI_CSN_BUSY) - unit: Cycles - tips: - SPI Busy: - avg: AVG(GRBM_SPI_BUSY) - min: MIN(GRBM_SPI_BUSY) - max: MAX(GRBM_SPI_BUSY) - unit: Cycles - tips: - SQ Busy: - avg: AVG(SQ_BUSY_CYCLES) - min: MIN(SQ_BUSY_CYCLES) - max: MAX(SQ_BUSY_CYCLES) - unit: Cycles + Accelerator Utilization: + avg: AVG(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + min: MIN(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + max: MAX(100 * GRBM_GUI_ACTIVE / GRBM_COUNT) + unit: Pct + tips: + Scheduler-Pipe Utilization: + avg: AVG(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + min: MIN(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + max: MAX(100 * SPI_CSN_BUSY / (GRBM_GUI_ACTIVE * $numPipes * $numSE)) + unit: Pct + tips: + Workgroup Manager Utilization: + avg: AVG(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + min: MIN(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + max: MAX(100 * GRBM_SPI_BUSY / GRBM_GUI_ACTIVE) + unit: Pct + tips: + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + min: MIN(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + max: MAX(100 * SQ_BUSY_CYCLES / (GRBM_GUI_ACTIVE * $numSE)) + unit: Pct + tips: + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Dispatched Workgroups: avg: AVG(SPI_CSN_NUM_THREADGROUPS) @@ -55,22 +61,27 @@ Panel Config: max: MAX(SPI_CSN_WAVE) unit: Wavefronts tips: - Wave Alloc Failed: - avg: AVG(SPI_RA_REQ_NO_ALLOC) - min: MIN(SPI_RA_REQ_NO_ALLOC) - max: MAX(SPI_RA_REQ_NO_ALLOC) - unit: Cycles - tips: - Wave Alloc Failed - CS: - avg: AVG(SPI_RA_REQ_NO_ALLOC_CSN) - min: MIN(SPI_RA_REQ_NO_ALLOC_CSN) - max: MAX(SPI_RA_REQ_NO_ALLOC_CSN) - unit: Cycles + VGPR Writes: + avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if 
(SPI_CSN_WAVE != 0) else + None)) + min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave tips: - + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + tips: - metric_table: id: 602 - title: SPI Resource Allocation + title: Workgroup Manager - Resource Allocation header: metric: Metric avg: Avg @@ -79,96 +90,78 @@ Panel Config: unit: Unit tips: Tips metric: - Wave request Failed (CS): - avg: AVG(SPI_RA_REQ_NO_ALLOC_CSN) - min: MIN(SPI_RA_REQ_NO_ALLOC_CSN) - max: MAX(SPI_RA_REQ_NO_ALLOC_CSN) - unit: Cycles - tips: - CS Stall: - avg: AVG(SPI_RA_RES_STALL_CSN) - min: MIN(SPI_RA_RES_STALL_CSN) - max: MAX(SPI_RA_RES_STALL_CSN) - unit: Cycles - tips: - CS Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + unit: Pct + tips: + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != + 0) else None) + unit: Pct + tips: + Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / GRBM_SPI_BUSY) if (GRBM_SPI_BUSY != + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None)) - unit: pct + unit: Pct tips: - Scratch Stall: - avg: AVG(SPI_RA_TMP_STALL_CSN) - min: MIN(SPI_RA_TMP_STALL_CSN) - max: MAX(SPI_RA_TMP_STALL_CSN) - unit: Cycles + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / (GRBM_SPI_BUSY * $numSE)) if (GRBM_SPI_BUSY != 0) else None) + unit: Pct tips: Insufficient SIMD Waveslots: - avg: AVG(SPI_RA_WAVE_SIMD_FULL_CSN) - min: MIN(SPI_RA_WAVE_SIMD_FULL_CSN) - max: MAX(SPI_RA_WAVE_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient SIMD VGPRs: - avg: AVG(SPI_RA_VGPR_SIMD_FULL_CSN) - min: MIN(SPI_RA_VGPR_SIMD_FULL_CSN) - max: MAX(SPI_RA_VGPR_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * 
SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient SIMD SGPRs: - avg: AVG(SPI_RA_SGPR_SIMD_FULL_CSN) - min: MIN(SPI_RA_SGPR_SIMD_FULL_CSN) - max: MAX(SPI_RA_SGPR_SIMD_FULL_CSN) - unit: SIMD + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: Insufficient CU LDS: - avg: AVG(SPI_RA_LDS_CU_FULL_CSN) - min: MIN(SPI_RA_LDS_CU_FULL_CSN) - max: MAX(SPI_RA_LDS_CU_FULL_CSN) - unit: CU - tips: - Insufficient CU Barries: - avg: AVG(SPI_RA_BAR_CU_FULL_CSN) - min: MIN(SPI_RA_BAR_CU_FULL_CSN) - max: MAX(SPI_RA_BAR_CU_FULL_CSN) - unit: CU - tips: - Insufficient Bulky Resource: - avg: AVG(SPI_RA_BULKY_CU_FULL_CSN) - min: MIN(SPI_RA_BULKY_CU_FULL_CSN) - max: MAX(SPI_RA_BULKY_CU_FULL_CSN) - unit: CU - tips: - Reach CU Threadgroups Limit: - avg: AVG(SPI_RA_TGLIM_CU_FULL_CSN) - min: MIN(SPI_RA_TGLIM_CU_FULL_CSN) - max: MAX(SPI_RA_TGLIM_CU_FULL_CSN) - unit: Cycles - tips: - Reach CU Wave Limit: - avg: AVG(SPI_RA_WVLIM_STALL_CSN) - min: MIN(SPI_RA_WVLIM_STALL_CSN) - max: MAX(SPI_RA_WVLIM_STALL_CSN) - unit: Cycles + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct + tips: + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / (GRBM_GUI_ACTIVE * $numCU)) + unit: Pct tips: - VGPR Writes: - avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: diff --git a/src/omniperf_analyze/configs/gfx90a/0700_wavefront-launch.yaml b/src/omniperf_analyze/configs/gfx90a/0700_wavefront-launch.yaml index 13ba5b8e1..5ab83270f 100644 --- a/src/omniperf_analyze/configs/gfx90a/0700_wavefront-launch.yaml +++ b/src/omniperf_analyze/configs/gfx90a/0700_wavefront-launch.yaml @@ -77,7 +77,7 @@ Panel Config: avg: AVG(scr) min: MIN(scr) max: MAX(scr) - unit: Bytes + unit: Bytes/Workitem tips: - metric_table: @@ -103,7 +103,7 @@ Panel Config: max: MAX(GRBM_GUI_ACTIVE) unit: Cycle tips: - Instr/wavefront: + Instructions per wavefront: avg: AVG((SQ_INSTS / SQ_WAVES)) min: MIN((SQ_INSTS / 
SQ_WAVES)) max: MAX((SQ_INSTS / SQ_WAVES)) diff --git a/src/omniperf_analyze/configs/gfx90a/1000_compute-unit-instruction-mix.yaml b/src/omniperf_analyze/configs/gfx90a/1000_compute-unit-instruction-mix.yaml index 8ffd87d2c..f7867b6ea 100644 --- a/src/omniperf_analyze/configs/gfx90a/1000_compute-unit-instruction-mix.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1000_compute-unit-instruction-mix.yaml @@ -10,7 +10,7 @@ Panel Config: data source: - metric_table: id: 1001 - title: Instruction Mix + title: Overall Instruction Mix header: metric: Metric avg: Avg @@ -22,7 +22,7 @@ Panel Config: type: simple_bar label_txt: (# of instr + $normUnit) metric: - VALU - Vector: + VALU: avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) @@ -40,7 +40,7 @@ Panel Config: max: MAX((SQ_INSTS_LDS / $denom)) unit: (instr + $normUnit) tips: - VALU - MFMA: + MFMA: avg: AVG((SQ_INSTS_MFMA / $denom)) min: MIN((SQ_INSTS_MFMA / $denom)) max: MAX((SQ_INSTS_MFMA / $denom)) @@ -64,12 +64,6 @@ Panel Config: max: MAX((SQ_INSTS_BRANCH / $denom)) unit: (instr + $normUnit) tips: - GDS: - avg: AVG((SQ_INSTS_GDS / $denom)) - min: MIN((SQ_INSTS_GDS / $denom)) - max: MAX((SQ_INSTS_GDS / $denom)) - unit: (instr + $normUnit) - tips: - metric_table: id: 1002 @@ -180,55 +174,100 @@ Panel Config: id: 1003 title: VMEM Instr Mix header: - type: type - count: Count + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit tips: Tips metric: - Buffer Instr: - count: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - tips: - Buffer Read: - count: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - tips: - Buffer Write: - count: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - tips: - Buffer Atomic: - count: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - tips: - Flat Instr: - count: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - tips: - Flat Read: - count: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - tips: - Flat Write: - count: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - tips: - Flat Atomic: - count: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - tips: + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Read: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: 
MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + tips: - metric_table: id: 1004 title: MFMA Arithmetic Instr Mix header: - type: type - count: Count + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit tips: Tips metric: MFMA-I8: - count: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) + avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) + unit: (instr + $normUnit) tips: MFMA-F16: - count: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) + avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) + unit: (instr + $normUnit) tips: MFMA-BF16: - count: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + unit: (instr + $normUnit) tips: MFMA-F32: - count: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) - tips: + avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) + unit: (instr + $normUnit) + tips: MFMA-F64: - count: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) - tips: + avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) + unit: (instr + $normUnit) + tips: diff --git a/src/omniperf_analyze/configs/gfx90a/1100_compute-unit-compute-pipeline.yaml b/src/omniperf_analyze/configs/gfx90a/1100_compute-unit-compute-pipeline.yaml index 39a144731..04b7d6027 100644 --- a/src/omniperf_analyze/configs/gfx90a/1100_compute-unit-compute-pipeline.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1100_compute-unit-compute-pipeline.yaml @@ -13,8 +13,10 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit + peak: Peak + pop: Pct of Peak tips: Tips style: type: simple_bar @@ -22,39 +24,62 @@ Panel Config: label_txt: (%) xrange: [0, 110] metric: - valu_flops_pop: - value: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) + / (EndNs - BeginNs))) + unit: GFLOP + peak: (((($sclk * $numCU) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (EndNs - BeginNs)))) / (((($sclk * $numCU) * 64) * 2) / 1000)) - unit: Pct of Peak tips: - mfma_flops_bf16_pop: - value: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (EndNs - BeginNs)))) - / ((($sclk * $numCU) * 512) / 1000)) - unit: Pct of Peak + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / 
(EndNs - BeginNs))) + unit: GIOP + peak: (((($sclk * $numCU) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (EndNs + - BeginNs)))) / (((($sclk * $numCU) * 64) * 2) / 1000)) tips: - mfma_flops_f16_pop: - value: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (EndNs - BeginNs)))) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (EndNs - BeginNs))) + unit: GFLOP + peak: ((($sclk * $numCU) * 1024) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (EndNs - BeginNs)))) / ((($sclk * $numCU) * 1024) / 1000)) - unit: Pct of Peak tips: - mfma_flops_f32_pop: - value: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (EndNs - BeginNs)))) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (EndNs - BeginNs))) + unit: GFLOP + peak: ((($sclk * $numCU) * 1024) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (EndNs - BeginNs)))) + / ((($sclk * $numCU) * 1024) / 1000)) + tips: + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (EndNs - BeginNs))) + unit: GFLOP + peak: ((($sclk * $numCU) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (EndNs - BeginNs)))) / ((($sclk * $numCU) * 256) / 1000)) - unit: Pct of Peak tips: - mfma_flops_f64_pop: - value: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (EndNs - BeginNs)))) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (EndNs - BeginNs))) + unit: GFLOP + peak: ((($sclk * $numCU) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (EndNs - BeginNs)))) / ((($sclk * $numCU) * 256) / 1000)) - unit: Pct of Peak tips: - mfma_flops_i8_pop: - value: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (EndNs - BeginNs)))) + MFMA IOPs (INT8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (EndNs - BeginNs))) + unit: GIOP + peak: ((($sclk * $numCU) * 1024) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (EndNs - BeginNs)))) / ((($sclk * $numCU) * 1024) / 1000)) - unit: Pct of Peak tips: - metric_table: @@ -68,36 +93,48 @@ Panel Config: unit: Unit tips: Tips metric: - IPC (Avg): + IPC: avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) unit: Instr/cycle tips: - IPC (Issue): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + IPC (Issued): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) - min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM) - + SQ_INSTS_GDS) + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) / SQ_ACTIVE_INST_ANY)) unit: Instr/cycle tips: - SALU Util: + SALU Utilization: avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) min: MIN((((100 * 
SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) max: MAX((((100 * SQ_ACTIVE_INST_SCA) / GRBM_GUI_ACTIVE) / $numCU)) unit: pct tips: - VALU Util: + VALU Utilization: avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) min: MIN((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) max: MAX((((100 * SQ_ACTIVE_INST_VALU) / GRBM_GUI_ACTIVE) / $numCU)) unit: pct tips: + VMEM Utilization: + avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / GRBM_GUI_ACTIVE) / $numCU)) + min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / GRBM_GUI_ACTIVE) / $numCU)) + max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / GRBM_GUI_ACTIVE) / $numCU)) + unit: pct + tips: + Branch Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / GRBM_GUI_ACTIVE) / $numCU)) + min: MIN((((100 * SQ_ACTIVE_INST_MISC) / GRBM_GUI_ACTIVE) / $numCU)) + max: MAX((((100 * SQ_ACTIVE_INST_MISC) / GRBM_GUI_ACTIVE) / $numCU)) + unit: pct + tips: VALU Active Threads: avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) @@ -107,7 +144,7 @@ Panel Config: != 0) else None)) unit: Threads tips: - MFMA Util: + MFMA Utilization: avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $numCU) * GRBM_GUI_ACTIVE))) min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $numCU) * GRBM_GUI_ACTIVE))) max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $numCU) * GRBM_GUI_ACTIVE))) @@ -122,6 +159,26 @@ Panel Config: else None)) unit: cycles/instr tips: + VMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_VMEM + tips: + SMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_SMEM + tips: - metric_table: id: 1103 @@ -158,10 +215,10 @@ Panel Config: $denom)) unit: (OPs + $normUnit) tips: - INT8 OPs: - avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + IOPs (Total): + avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) + min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) + max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) unit: (OPs + $normUnit) tips: F16 OPs: @@ -200,52 +257,9 @@ Panel Config: + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) unit: (OPs + $normUnit) tips: - - - metric_table: - id: 1104 - title: Memory Latencies - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - VMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - unit: Cycles - tips: SQ_INSTS_LEVEL_VMEM - SMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM 
!= 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - unit: Cycles - tips: SQ_INSTS_LEVEL_SMEM - Instr Fetch Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_IFETCH) if (SQ_IFETCH != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_IFETCH) if (SQ_IFETCH != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_IFETCH) if (SQ_IFETCH != 0) - else None)) - unit: Cycles - tips: SQ_IFETCH_LEVEL - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - unit: Cycles - tips: SQ_INST_LEVEL_LDS - \ No newline at end of file + INT8 OPs: + avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + unit: (OPs + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx90a/1200_lds.yaml b/src/omniperf_analyze/configs/gfx90a/1200_lds.yaml index 3fd52c3b1..6af1641d1 100644 --- a/src/omniperf_analyze/configs/gfx90a/1200_lds.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1200_lds.yaml @@ -13,7 +13,7 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: @@ -26,20 +26,24 @@ Panel Config: value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / (GRBM_GUI_ACTIVE * $numCU))) unit: Pct of Peak tips: + unit: pct Access Rate: value: AVG(((200 * SQ_ACTIVE_INST_LDS) / (GRBM_GUI_ACTIVE * $numCU))) unit: Pct of Peak tips: - Bandwidth (Pct-of-Peak): + unit: pct + Theoretical Bandwidth (% of Peak): value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / (EndNs - BeginNs)) / (($sclk * $numCU) * 0.00128))) unit: Pct of Peak tips: + unit: pct Bank Conflict Rate: value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) unit: Pct of Peak tips: + unit: pct - metric_table: id: 1202 @@ -58,7 +62,7 @@ Panel Config: max: MAX((SQ_INSTS_LDS / $denom)) unit: (Instr + $normUnit) tips: - Bandwidth: + Theoretical Bandwidth: avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) / $denom)) min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($LDSBanks)) @@ -67,7 +71,14 @@ Panel Config: / $denom)) unit: (Bytes + $normUnit) tips: - Bank Conficts/Access: + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + tips: + Bank Conflicts/Access: avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) @@ -82,7 +93,7 @@ Panel Config: max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) unit: (Cycles + $normUnit) tips: - Atomic Cycles: + Atomic Return Cycles: avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) @@ -110,12 +121,5 @@ Panel Config: avg: 
AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) - unit: ( + $normUnit) - tips: - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_LDS - tips: + unit: (Accesses + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx90a/1300_instruction-cache.yaml b/src/omniperf_analyze/configs/gfx90a/1300_instruction-cache.yaml index 329a7edba..98a38e2c2 100644 --- a/src/omniperf_analyze/configs/gfx90a/1300_instruction-cache.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1300_instruction-cache.yaml @@ -13,7 +13,7 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: @@ -27,11 +27,16 @@ Panel Config: * (EndNs - BeginNs)))) unit: Pct of Peak tips: - Cache Hit: + Cache Hit Rate: value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + SQC_ICACHE_MISSES_DUPLICATE))) unit: Pct of Peak tips: + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($sclk * $numSQC) + * (EndNs - BeginNs)))) + unit: Pct of Peak + tips: - metric_table: id: 1302 @@ -68,7 +73,7 @@ Panel Config: max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) unit: (Misses + $normUnit) tips: - Cache Hit: + Cache Hit Rate: avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + SQC_ICACHE_MISSES_DUPLICATE))) min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + @@ -77,3 +82,27 @@ Panel Config: SQC_ICACHE_MISSES_DUPLICATE))) unit: pct tips: + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + tips: + - metric_table: + id: 1303 + title: Instruction Cache - L2 Interface + header: + metric: Metric + mean: Mean + min: Min + max: Max + unit: Unit + tips: Tips + metric: + L1I-L2 Bandwidth: + mean: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx90a/1400_constant-cache.yaml b/src/omniperf_analyze/configs/gfx90a/1400_constant-cache.yaml index 563caad13..aa55fee0c 100644 --- a/src/omniperf_analyze/configs/gfx90a/1400_constant-cache.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1400_constant-cache.yaml @@ -12,8 +12,8 @@ Panel Config: id: 1401 title: Speed-of-Light header: - mertic: Metric - value: Value + metric: Metric + value: Avg unit: Unit tips: Tips style: @@ -27,12 +27,17 @@ Panel Config: * (EndNs - BeginNs)))) unit: Pct of Peak tips: - Cache Hit: + Cache Hit Rate: value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) unit: Pct of Peak tips: + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) + / (2 * ($sclk * $numSQC) * (EndNs - BeginNs))) + unit: Pct of Peak + tips: - metric_table: id: 1402 @@ -69,7 +74,7 @@ Panel Config: max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) unit: (Req + $normUnit) tips: - 
Cache Hit: + Cache Hit Rate: avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) @@ -138,6 +143,12 @@ Panel Config: unit: Unit tips: Tips metric: + sL1D-L2 BW: + mean: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) + unit: (Bytes + $normUnit) + tips: Read Req: avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) min: MIN((SQC_TC_DATA_READ_REQ / $denom)) @@ -156,7 +167,7 @@ Panel Config: max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) unit: (Req + $normUnit) tips: - Stall: + Stall Cycles: avg: AVG((SQC_TC_STALL / $denom)) min: MIN((SQC_TC_STALL / $denom)) max: MAX((SQC_TC_STALL / $denom)) diff --git a/src/omniperf_analyze/configs/gfx90a/1500_TA_and_TD.yaml b/src/omniperf_analyze/configs/gfx90a/1500_TA_and_TD.yaml index 03af85497..5f7d73df8 100644 --- a/src/omniperf_analyze/configs/gfx90a/1500_TA_and_TD.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1500_TA_and_TD.yaml @@ -6,11 +6,11 @@ Metric Description: # Define the panel properties and properties of each metric in the panel. Panel Config: id: 1500 - title: Texture Addresser and Texture Data (TA/TD) + title: Address Processing Unit and Data Return Path (TA/TD) data source: - metric_table: id: 1501 - title: TA + title: Address Processing Unit header: metric: Metric avg: Avg @@ -19,25 +19,25 @@ Panel Config: unit: Unit tips: Tips metric: - TA Busy: + Address Processing Unit Busy: avg: AVG(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_TA_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TA Addr Stall: + Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TA Data Stall: + Data Stall: avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TD2TA Addr Stall: + Data-Processor → Address Stall: avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / (GRBM_GUI_ACTIVE * $numCU))) @@ -47,69 +47,69 @@ Panel Config: avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Instr: + Global/Generic Instructions: avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Read Instr: + Global/Generic Read Instructions: avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instr + 
$normUnit) + unit: (Instructions + $normUnit) tips: - Flat Write Instr: + Global/Generic Write Instructions: avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Flat Atomic Instr: + Global/Generic Atomic Instructions: avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Instr: + Spill/Stack Instructions: avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Read Instr: + Spill/Stack Read Instructions: avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Write Instr: + Spill/Stack Write Instructions: avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Atomic Instr: + Spill/Stack Atomic Instructions: avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Buffer Total Cylces: + Spill/Stack Total Cycles: avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) tips: - Buffer Coalesced Read: + Spill/Stack Coalesced Read: avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) unit: (Cycles + $normUnit) tips: - Buffer Coalesced Write: + Spill/Stack Coalesced Write: avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) @@ -127,48 +127,48 @@ Panel Config: unit: Unit tips: Tips metric: - TD Busy: + Data-Return Busy: avg: AVG(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TD_TD_BUSY_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - TC2TD Stall: + Cache RAM → Data-Return Stall: avg: AVG(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TD_TC_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - SPI2TD Stall: + Workgroup manager → Data-Return Stall: avg: AVG(((100 * TD_SPI_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) min: MIN(((100 * TD_SPI_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) max: MAX(((100 * TD_SPI_STALL_sum) / (GRBM_GUI_ACTIVE * $numCU))) unit: pct tips: - Coalescable Instr: + Coalescable Instructions: avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Load Instr: + Read 
Instructions: avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Store Instr: + Write Instructions: avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: - Atomic Instr: + Atomic Instructions: avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instr + $normUnit) + unit: (Instructions + $normUnit) tips: diff --git a/src/omniperf_analyze/configs/gfx90a/1600_L1_cache.yaml b/src/omniperf_analyze/configs/gfx90a/1600_L1_cache.yaml index d9291de21..559bbdcb9 100644 --- a/src/omniperf_analyze/configs/gfx90a/1600_L1_cache.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1600_L1_cache.yaml @@ -13,7 +13,7 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: @@ -22,26 +22,26 @@ Panel Config: label_txt: (%) xrange: [0, 110] metric: - Buffer Coalescing: - value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - tips: - Cache Util: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) unit: Pct of Peak tips: - Cache BW: + Bandwidth: value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs)))) / ((($sclk / 1000) * 64) * $numCU)) unit: Pct of Peak tips: - Cache Hit: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + tips: + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) unit: Pct of Peak tips: @@ -141,11 +141,26 @@ Panel Config: unit: (Req + $normUnit) tips: Cache BW: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (EndNs - BeginNs))) - unit: GB/s + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + unit: (Bytes + $normUnit) tips: + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + 
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / + TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else + None)) + unit: pct + tips: Cache Accesses: avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) @@ -164,22 +179,7 @@ Panel Config: / $denom)) unit: (Req + $normUnit) tips: - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: pct - tips: - Invalidate: + Invalidations: avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) @@ -188,9 +188,9 @@ Panel Config: L1-L2 BW: avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - min: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - max: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) unit: (Bytes + $normUnit) tips: @@ -388,17 +388,17 @@ Panel Config: avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Hits + $normUnit) + units: (Req + $normUnit) tips: - Misses (Translation): + Translation Misses: avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Misses + $normUnit) + units: (Req + $normUnit) tips: - Misses (Permission): + Permission Misses: avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Misses + $normUnit) + units: (Req + $normUnit) tips: diff --git a/src/omniperf_analyze/configs/gfx90a/1700_L2_cache.yaml b/src/omniperf_analyze/configs/gfx90a/1700_L2_cache.yaml index ddbaf9155..b2e8c6946 100644 --- a/src/omniperf_analyze/configs/gfx90a/1700_L2_cache.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1700_L2_cache.yaml @@ -13,31 +13,35 @@ Panel Config: title: Speed-of-Light header: metric: Metric - value: Value + value: Avg unit: Unit tips: Tips style: type: simple_bar metric: - L2 Util: + Utilization: value: AVG(((TCC_BUSY_sum * 100) / 
(TO_INT($L2Banks) * GRBM_GUI_ACTIVE))) unit: pct + tips: + Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 128) / (EndNs - BeginNs)))) / ((($sclk / 1000) * 128) * TO_INT($L2Banks))) + unit: pct tips: - Cache Hit: + Hit Rate: value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)) unit: pct - tips: - L2-EA Rd BW: + tips: + L2-Fabric Read BW: value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / (EndNs - BeginNs))) unit: GB/s - tips: - L2-EA Wr BW: + tips: + L2-Fabric Write and Atomic BW: value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / (EndNs - BeginNs))) unit: GB/s - tips: + tips: - metric_table: id: 1702 @@ -50,7 +54,7 @@ Panel Config: unit: Unit tips: Tips metric: - Read BW: + L2-Fabric Read BW: avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / $denom)) min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) @@ -58,8 +62,26 @@ Panel Config: max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / $denom)) unit: (Bytes + $normUnit) - tips: - Write BW: + tips: + HBM Read Traffic: + avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + Remote Read Traffic: + avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + tips: + L2-Fabric Write and Atomic BW: avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / $denom)) min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) @@ -67,55 +89,31 @@ Panel Config: max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / $denom)) unit: (Bytes + $normUnit) - tips: - Read (32B): - avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (Uncached 32B): - avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (64B): - avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - HBM Read: - avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write (32B): - avg: 
AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - Write (Uncached 32B): - avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write (64B): - avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Write: - avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: + tips: + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Remote Write and Atomic Traffic: + avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Atomic Traffic: + avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + tips: Read Latency: avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) @@ -124,7 +122,7 @@ Panel Config: max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) unit: Cycles - tips: + tips: Write Latency: avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) @@ -133,7 +131,7 @@ Panel Config: max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) unit: Cycles - tips: + tips: Atomic Latency: avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum != 0) else None)) @@ -142,7 +140,7 @@ Panel Config: max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum != 0) else None)) unit: Cycles - tips: + tips: Read Stall: avg: AVG((((100 * ((TCC_EA_RDREQ_IO_CREDIT_STALL_sum + TCC_EA_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != @@ -154,7 +152,7 @@ Panel Config: + TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) unit: pct - tips: + tips: Write Stall: avg: AVG((((100 * ((TCC_EA_WRREQ_IO_CREDIT_STALL_sum + TCC_EA_WRREQ_GMI_CREDIT_STALL_sum) + TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if 
(TCC_BUSY_sum != @@ -166,7 +164,7 @@ Panel Config: + TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) unit: pct - tips: + tips: - metric_table: id: 1703 @@ -179,117 +177,123 @@ Panel Config: unit: Unit tips: Tips metric: + Bandwidth: + avg: AVG((TCC_REQ_sum * 128) / $denom) + min: MIN((TCC_REQ_sum * 128) / $denom) + max: MAX((TCC_REQ_sum * 128) / $denom) + unit: (Bytes + $normUnit) + tips: Req: avg: AVG((TCC_REQ_sum / $denom)) min: MIN((TCC_REQ_sum / $denom)) max: MAX((TCC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: + tips: Read Req: avg: AVG((TCC_READ_sum / $denom)) min: MIN((TCC_READ_sum / $denom)) max: MAX((TCC_READ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: Write Req: avg: AVG((TCC_WRITE_sum / $denom)) min: MIN((TCC_WRITE_sum / $denom)) max: MAX((TCC_WRITE_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: Atomic Req: avg: AVG((TCC_ATOMIC_sum / $denom)) min: MIN((TCC_ATOMIC_sum / $denom)) max: MAX((TCC_ATOMIC_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + tips: Probe Req: avg: AVG((TCC_PROBE_sum / $denom)) min: MIN((TCC_PROBE_sum / $denom)) max: MAX((TCC_PROBE_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + tips: Hits: avg: AVG((TCC_HIT_sum / $denom)) min: MIN((TCC_HIT_sum / $denom)) max: MAX((TCC_HIT_sum / $denom)) unit: (Hits + $normUnit) - tips: + tips: Misses: avg: AVG((TCC_MISS_sum / $denom)) min: MIN((TCC_MISS_sum / $denom)) max: MAX((TCC_MISS_sum / $denom)) unit: (Misses + $normUnit) - tips: - Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - tips: + tips: Writeback: avg: AVG((TCC_WRITEBACK_sum / $denom)) min: MIN((TCC_WRITEBACK_sum / $denom)) max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: + unit: (Cachelines + $normUnit) + tips: + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Evict (Internal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: 
MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + tips: NC Req: avg: AVG((TCC_NC_REQ_sum / $denom)) min: MIN((TCC_NC_REQ_sum / $denom)) max: MAX((TCC_NC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: UC Req: avg: AVG((TCC_UC_REQ_sum / $denom)) min: MIN((TCC_UC_REQ_sum / $denom)) max: MAX((TCC_UC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: CC Req: avg: AVG((TCC_CC_REQ_sum / $denom)) min: MIN((TCC_CC_REQ_sum / $denom)) max: MAX((TCC_CC_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: RW Req: avg: AVG((TCC_RW_REQ_sum / $denom)) min: MIN((TCC_RW_REQ_sum / $denom)) max: MAX((TCC_RW_REQ_sum / $denom)) unit: (Req + $normUnit) - tips: - Writeback (Normal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: - Writeback (TC Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: ( + $normUnit) - tips: - Evict (Normal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: ( + $normUnit) - tips: - Evict (TC Req): - avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: ( + $normUnit) - tips: + tips: - metric_table: id: 1704 @@ -306,59 +310,137 @@ Panel Config: style: type: simple_multi_bar metric: - Read - Remote Socket Stall: - type: Remote Socket Stall + Read - PCIe Stall: + type: PCIe Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_IO_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read - Peer GCD Stall: - type: Peer GCD Stall + avg: AVG(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Read - Infinity Fabric™ Stall: + type: Infinity Fabric™ Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Read - HBM Stall: type: HBM Stall transaction: Read - avg: AVG((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write - Remote Socket Stall: - type: Remote Socket Stall + avg: AVG(((100 * (TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * 
(TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Write - PCIe Stall: + type: PCIe Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_IO_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write - Peer GCD Stall: - type: Peer GCD Stall + avg: AVG(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + Write - Infinity Fabric™ Stall: + type: Infinity Fabric™ Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Write - HBM Stall: type: HBM Stall transaction: Write - avg: AVG((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - min: MIN((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - max: MAX((TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / $denom)) - unit: (Req + $normUnit) - tips: + avg: AVG(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: Write - Credit Starvation: type: Credit Starvation transaction: Write - avg: AVG((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) - min: MIN((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) - max: MAX((TCC_TOO_MANY_EA_WRREQS_STALL_sum / $denom)) + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + tips: + + - metric_table: + id: 1705 + title: L2 - Fabric Detailed Transaction Breakdown + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + tips: Tips + metric: + Read (32B): + avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) unit: (Req + $normUnit) - tips: + tips: + Read (Uncached): + avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Read (64B): + avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + unit: (Req + $normUnit) + tips: + HBM Read: + avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + tips: + 
Remote Read: + avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (32B): + avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (Uncached): + avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + tips: + Write and Atomic (64B): + avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + tips: + HBM Write and Atomic: + avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + tips: + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + tips: + Atomic: + avg: AVG((TCC_EA_ATOMIC_sum / $denom)) + min: MIN((TCC_EA_ATOMIC_sum / $denom)) + max: MAX((TCC_EA_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + tips: \ No newline at end of file diff --git a/src/omniperf_analyze/configs/gfx90a/1800_L2_cache_per_channel.yaml b/src/omniperf_analyze/configs/gfx90a/1800_L2_cache_per_channel.yaml index c6d93aa61..42d3014b1 100644 --- a/src/omniperf_analyze/configs/gfx90a/1800_L2_cache_per_channel.yaml +++ b/src/omniperf_analyze/configs/gfx90a/1800_L2_cache_per_channel.yaml @@ -167,7 +167,7 @@ Panel Config: + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) unit: pct tips: - Req: + L2 Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_REQ[0]) + TO_INT(TCC_REQ[1])) + TO_INT(TCC_REQ[2])) + TO_INT(TCC_REQ[3])) + TO_INT(TCC_REQ[4])) + TO_INT(TCC_REQ[5])) + TO_INT(TCC_REQ[6])) + TO_INT(TCC_REQ[7])) + TO_INT(TCC_REQ[8])) + TO_INT(TCC_REQ[9])) @@ -206,7 +206,7 @@ Panel Config: + TO_INT(TCC_REQ[30])) + TO_INT(TCC_REQ[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Read Req: + L2 Read Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_READ[0]) + TO_INT(TCC_READ[1])) + TO_INT(TCC_READ[2])) + TO_INT(TCC_READ[3])) + TO_INT(TCC_READ[4])) + TO_INT(TCC_READ[5])) + TO_INT(TCC_READ[6])) + TO_INT(TCC_READ[7])) + TO_INT(TCC_READ[8])) + TO_INT(TCC_READ[9])) @@ -249,7 +249,7 @@ Panel Config: + TO_INT(TCC_READ[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Write Req: + L2 Write Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_WRITE[0]) + TO_INT(TCC_WRITE[1])) + TO_INT(TCC_WRITE[2])) + TO_INT(TCC_WRITE[3])) + TO_INT(TCC_WRITE[4])) + TO_INT(TCC_WRITE[5])) + TO_INT(TCC_WRITE[6])) + TO_INT(TCC_WRITE[7])) + TO_INT(TCC_WRITE[8])) @@ -296,7 +296,7 @@ Panel Config: + TO_INT(TCC_WRITE[30])) + TO_INT(TCC_WRITE[31])) / 32) / $denom)) unit: (Req + $normUnit) tips: - L1 - L2 Atomic Req: + L2 Atomic Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_ATOMIC[0]) + TO_INT(TCC_ATOMIC[1])) + TO_INT(TCC_ATOMIC[2])) + TO_INT(TCC_ATOMIC[3])) + TO_INT(TCC_ATOMIC[4])) + TO_INT(TCC_ATOMIC[5])) + TO_INT(TCC_ATOMIC[6])) + 
TO_INT(TCC_ATOMIC[7])) @@ -347,7 +347,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Read Req: + L2 - Fabric Read Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ[0]) + TO_INT(TCC_EA_RDREQ[1])) + TO_INT(TCC_EA_RDREQ[2])) + TO_INT(TCC_EA_RDREQ[3])) + TO_INT(TCC_EA_RDREQ[4])) + TO_INT(TCC_EA_RDREQ[5])) + TO_INT(TCC_EA_RDREQ[6])) + TO_INT(TCC_EA_RDREQ[7])) @@ -398,7 +398,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Write Req: + L2 - Fabric Write and Atomic Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ[0]) + TO_INT(TCC_EA_WRREQ[1])) + TO_INT(TCC_EA_WRREQ[2])) + TO_INT(TCC_EA_WRREQ[3])) + TO_INT(TCC_EA_WRREQ[4])) + TO_INT(TCC_EA_WRREQ[5])) + TO_INT(TCC_EA_WRREQ[6])) + TO_INT(TCC_EA_WRREQ[7])) @@ -449,7 +449,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Atomic Req: + L2 - Fabric Atomic Req: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_ATOMIC[0]) + TO_INT(TCC_EA_ATOMIC[1])) + TO_INT(TCC_EA_ATOMIC[2])) + TO_INT(TCC_EA_ATOMIC[3])) + TO_INT(TCC_EA_ATOMIC[4])) + TO_INT(TCC_EA_ATOMIC[5])) + TO_INT(TCC_EA_ATOMIC[6])) + TO_INT(TCC_EA_ATOMIC[7])) @@ -500,7 +500,7 @@ Panel Config: / 32) / $denom)) unit: (Req + $normUnit) tips: - L2 - EA Read Lat: + L2 - Fabric Read Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_RDREQ_LEVEL[0] + TCC_EA_RDREQ_LEVEL[1]) + TCC_EA_RDREQ_LEVEL[2]) + TCC_EA_RDREQ_LEVEL[3]) + TCC_EA_RDREQ_LEVEL[4]) + TCC_EA_RDREQ_LEVEL[5]) + TCC_EA_RDREQ_LEVEL[6]) + TCC_EA_RDREQ_LEVEL[7]) @@ -615,7 +615,7 @@ Panel Config: + TCC_EA_RDREQ[29]) + TCC_EA_RDREQ[30]) + TCC_EA_RDREQ[31]) != 0) else None)) unit: Cycles tips: - L2 - EA Write Lat: + L2 - Fabric Write Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_WRREQ_LEVEL[0] + TCC_EA_WRREQ_LEVEL[1]) + TCC_EA_WRREQ_LEVEL[2]) + TCC_EA_WRREQ_LEVEL[3]) + TCC_EA_WRREQ_LEVEL[4]) + TCC_EA_WRREQ_LEVEL[5]) + TCC_EA_WRREQ_LEVEL[6]) + TCC_EA_WRREQ_LEVEL[7]) @@ -730,7 +730,7 @@ Panel Config: + TCC_EA_WRREQ[29]) + TCC_EA_WRREQ[30]) + TCC_EA_WRREQ[31]) != 0) else None)) unit: Cycles tips: - L2 - EA Atomic Lat: + L2 - Fabric Atomic Lat: avg: AVG((((((((((((((((((((((((((((((((((TCC_EA_ATOMIC_LEVEL[0] + TCC_EA_ATOMIC_LEVEL[1]) + TCC_EA_ATOMIC_LEVEL[2]) + TCC_EA_ATOMIC_LEVEL[3]) + TCC_EA_ATOMIC_LEVEL[4]) + TCC_EA_ATOMIC_LEVEL[5]) + TCC_EA_ATOMIC_LEVEL[6]) + TCC_EA_ATOMIC_LEVEL[7]) @@ -849,7 +849,7 @@ Panel Config: None)) unit: Cycles tips: - L2 - EA Read Stall (IO): + L2 - Fabric Read Stall (PCIe): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[0]) + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[1])) + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[2])) + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[3])) + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[4])) @@ -920,7 +920,7 @@ Panel Config: + TO_INT(TCC_EA_RDREQ_IO_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Read Stall (GMI): + L2 - Fabric Read Stall (Infinity Fabric™): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[0]) + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[1])) + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[2])) + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[3])) + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[4])) @@ -991,7 +991,7 @@ Panel Config: + TO_INT(TCC_EA_RDREQ_GMI_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Read Stall (DRAM): + L2 - Fabric Read Stall (HBM): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[0]) + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[1])) + 
TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[2])) + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[3])) + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[4])) @@ -1062,7 +1062,7 @@ Panel Config: + TO_INT(TCC_EA_RDREQ_DRAM_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (IO): + L2 - Fabric Write Stall (PCIe): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[0]) + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[1])) + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[2])) + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[3])) + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[4])) @@ -1133,7 +1133,7 @@ Panel Config: + TO_INT(TCC_EA_WRREQ_IO_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (GMI): + L2 - Fabric Write Stall (Infinity Fabric™): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[0]) + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[1])) + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[2])) + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[3])) + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[4])) @@ -1204,7 +1204,7 @@ Panel Config: + TO_INT(TCC_EA_WRREQ_GMI_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Write Stall (DRAM): + L2 - Fabric Write Stall (HBM): avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[0]) + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[1])) + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[2])) + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[3])) + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[4])) @@ -1275,7 +1275,7 @@ Panel Config: + TO_INT(TCC_EA_WRREQ_DRAM_CREDIT_STALL[31])) / 32) / $denom)) unit: (Cycles + $normUnit) tips: - L2 - EA Write Starve: + L2 - Fabric Write Starve: avg: AVG((((((((((((((((((((((((((((((((((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[0]) + TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[1])) + TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[2])) + TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[3])) + TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[4])) @@ -1354,22 +1354,22 @@ Panel Config: channel: Channel hit rate: L2 Cache Hit Rate (%) req: Requests (Requests) - read req: L1-L2 Read (Requests) - write req: L1-L2 Write (Requests) - atomic req: L1-L2 Atomic (Requests) - ea read req: L2-EA Read (Requests) - ea write req: L2-EA Write (Requests) - ea atomic req: L2-EA Atomic (Requests) - ea read lat - cycles: L2-EA Read Latency (Cycles) - ea write lat - cycles: L2-EA Write Latency (Cycles) - ea atomic lat - cycles: L2-EA Atomic Latency (Cycles) - ea read stall - io: L2-EA Read Stall - IO (Cycles per) - ea read stall - gmi: L2-EA Read Stall - GMI (Cycles per) - ea read stall - dram: L2-EA Read Stall - DRAM (Cycles per) - ea write stall - io: L2-EA Write Stall - IO (Cycles per) - ea write stall - gmi: L2-EA Write Stall - GMI (Cycles per) - ea write stall - dram: L2-EA Write Stall - DRAM (Cycles per) - ea write stall - starve: L2-EA Write Stall - Starve (Cycles per) + read req: L2 Read (Requests) + write req: L2 Write (Requests) + atomic req: L2 Atomic (Requests) + ea read req: L2-Fabric Read (Requests) + ea write req: L2-Fabric Write and Atomic (Requests) + ea atomic req: L2-Fabric Atomic (Requests) + ea read lat - cycles: L2-Fabric Read Latency (Cycles) + ea write lat - cycles: L2-Fabric Write Latency (Cycles) + ea atomic lat - cycles: L2-Fabric Atomic Latency (Cycles) + ea read stall - io: L2-Fabric Read Stall - PCIe (Cycles) + ea read stall - gmi: L2-Fabric Read Stall - Infinity Fabric™ (Cycles) + ea read stall - dram: L2-Fabric Read Stall - HBM (Cycles) + ea write stall - io: L2-Fabric Write Stall - PCIe (Cycles) + ea write stall - 
gmi: L2-Fabric Write Stall - Infinity Fabric™ (Cycles) + ea write stall - dram: L2-Fabric Write Stall - HBM (Cycles) + ea write stall - starve: L2-Fabric Write Stall - Starve (Cycles) tips: Tips metric: '0': @@ -1764,22 +1764,22 @@ Panel Config: channel: Channel hit rate: L2 Cache Hit Rate (%) req: Requests (Requests) - read req: L1-L2 Read (Requests) - write req: L1-L2 Write (Requests) - atomic req: L1-L2 Atomic (Requests) - ea read req: L2-EA Read (Requests) - ea write req: L2-EA Write (Requests) - ea atomic req: L2-EA Atomic (Requests) - ea read lat - cycles: L2-EA Read Latency (Cycles) - ea write lat - cycles: L2-EA Write Latency (Cycles) - ea atomic lat - cycles: L2-EA Atomic Latency (Cycles) - ea read stall - io: L2-EA Read Stall - IO (Cycles per) - ea read stall - gmi: L2-EA Read Stall - GMI (Cycles per) - ea read stall - dram: L2-EA Read Stall - DRAM (Cycles per) - ea write stall - io: L2-EA Write Stall - IO (Cycles per) - ea write stall - gmi: L2-EA Write Stall - GMI (Cycles per) - ea write stall - dram: L2-EA Write Stall - DRAM (Cycles per) - ea write stall - starve: L2-EA Write Stall - Starve (Cycles per) + read req: L2 Read (Requests) + write req: L2 Write (Requests) + atomic req: L2 Atomic (Requests) + ea read req: L2-Fabric Read (Requests) + ea write req: L2-Fabric Write and Atomic (Requests) + ea atomic req: L2-Fabric Atomic (Requests) + ea read lat - cycles: L2-Fabric Read Latency (Cycles) + ea write lat - cycles: L2-Fabric Write Latency (Cycles) + ea atomic lat - cycles: L2-Fabric Atomic Latency (Cycles) + ea read stall - io: L2-Fabric Read Stall - PCIe (Cycles) + ea read stall - gmi: L2-Fabric Read Stall - Infinity Fabric™ (Cycles) + ea read stall - dram: L2-Fabric Read Stall - HBM (Cycles) + ea write stall - io: L2-Fabric Write Stall - PCIe (Cycles) + ea write stall - gmi: L2-Fabric Write Stall - Infinity Fabric™ (Cycles) + ea write stall - dram: L2-Fabric Write Stall - HBM (Cycles) + ea write stall - starve: L2-Fabric Write Stall - Starve (Cycles) tips: Tips metric: '16': diff --git a/src/omniperf_analyze/configs/panel_config_template.yaml b/src/omniperf_analyze/configs/panel_config_template.yaml index e241896b4..4b81bad0e 100644 --- a/src/omniperf_analyze/configs/panel_config_template.yaml +++ b/src/omniperf_analyze/configs/panel_config_template.yaml @@ -30,7 +30,7 @@ Panel Config: value: Value unit: Unit peak: Peak - pop: PoP + pop: Pct of Peak tips: Tips metric: METRIC01: diff --git a/src/omniperf_analyze/utils/file_io.py b/src/omniperf_analyze/utils/file_io.py index 60850b626..8f9b887d0 100644 --- a/src/omniperf_analyze/utils/file_io.py +++ b/src/omniperf_analyze/utils/file_io.py @@ -129,7 +129,7 @@ def create_df_kernel_top_stats( # NB: support ignoring the 1st n dispatched execution by '> n' # The better way may be parsing python slice string if ">" in filter_dispatch_ids[0]: - m = re.match("\> (\d+)", filter_dispatch_ids[0]) + m = re.match(r"\> (\d+)", filter_dispatch_ids[0]) df = df[df["Index"] > int(m.group(1))] else: df = df.loc[df["Index"].astype(str).isin(filter_dispatch_ids)] diff --git a/src/omniperf_analyze/utils/parser.py b/src/omniperf_analyze/utils/parser.py index ceccb0746..5315d2ada 100644 --- a/src/omniperf_analyze/utils/parser.py +++ b/src/omniperf_analyze/utils/parser.py @@ -113,6 +113,11 @@ def to_min(*args): def to_max(*args): if len(args) == 1 and isinstance(args[0], pd.core.series.Series): return args[0].max() + elif len(args) == 2 and ( + isinstance(args[0], pd.core.series.Series) + or isinstance(args[1], pd.core.series.Series) + ): + return 
np.maximum(args[0], args[1]) elif max(args) == None: return np.nan else: @@ -268,7 +273,7 @@ def build_eval_string(equation, coll_level): # build-in variable starts with '$', python can not handle it. # replace '$' with 'ammolite__'. # TODO: pre-check there is no "ammolite__" in all config files. - s = re.sub("\$", "ammolite__", s) + s = re.sub(r"\$", "ammolite__", s) # convert equation string to intermediate expression in df array format ast_node = ast.parse(s) @@ -282,7 +287,7 @@ def build_eval_string(equation, coll_level): # the target is df['TCC_HIT[0]'] s = re.sub(r"\'\]\[(\d+)\]", r"[\g<1>]']", s) # use .get() to catch any potential KeyErrors - s = re.sub("raw_pmc_df\['(.*?)']", r'raw_pmc_df.get("\1")', s) + s = re.sub(r"raw_pmc_df\['(.*?)']", r'raw_pmc_df.get("\1")', s) # apply coll_level s = re.sub(r"raw_pmc_df", "raw_pmc_df.get('" + coll_level + "')", s) # print("--- build_eval_string, return: ", s) @@ -306,7 +311,7 @@ def update_denom_string(equation, unit): def update_normUnit_string(equation, unit): """ - Update $normUnit in equation with runtime nomorlization unit. + Update $normUnit in equation with runtime normalization unit. It is string replacement for display only. """ @@ -315,8 +320,8 @@ def update_normUnit_string(equation, unit): return "" return re.sub( - "\((?P\w*)\s+\+\s+(\$normUnit\))", - "\g " + re.sub("_", " ", unit), + r"\((?P\w*)\s+\+\s+(\$normUnit\))", + r"\g " + re.sub("_", " ", unit), str(equation), ).capitalize() @@ -564,9 +569,10 @@ def eval_metric(dfs, dfs_type, sys_info, soc_spec, raw_pmc_df, debug): # NB: # Following with Omniperf 0.2.0, we are using HW spec from sys_info instead. # The soc_spec is not in using right now, but can be used to do verification - # aganist sys_info, forced theoretical evaluation, or supporting tool-chains + # against sys_info, forced theoretical evaluation, or supporting tool-chains # broken. ammolite__numSE = sys_info.numSE + ammolite__numPipes = soc_spec.numPipes ammolite__numCU = sys_info.numCU ammolite__numSIMD = sys_info.numSIMD ammolite__numWavesPerCU = sys_info.maxWavesPerCU # todo: check do we still need it @@ -612,7 +618,7 @@ def eval_metric(dfs, dfs_type, sys_info, soc_spec, raw_pmc_df, debug): print("~" * 40 + "\nExpression:") print(expr, "=", row[expr]) print("Inputs:") - matched_vars = re.findall("ammolite__\w+", row[expr]) + matched_vars = re.findall(r"ammolite__\w+", row[expr]) if matched_vars: for v in matched_vars: print( @@ -622,12 +628,12 @@ def eval_metric(dfs, dfs_type, sys_info, soc_spec, raw_pmc_df, debug): eval(compile(v, "", "eval")), ) matched_cols = re.findall( - "raw_pmc_df\['\w+'\]\['\w+'\]", row[expr] + r"raw_pmc_df\['\w+'\]\['\w+'\]", row[expr] ) if matched_cols: for c in matched_cols: m = re.match( - "raw_pmc_df\['(\w+)'\]\['(\w+)'\]", c + r"raw_pmc_df\['(\w+)'\]\['(\w+)'\]", c ) t = raw_pmc_df[m.group(1)][ m.group(2) @@ -651,7 +657,7 @@ def eval_metric(dfs, dfs_type, sys_info, soc_spec, raw_pmc_df, debug): print("~" * 40) except TypeError: print( - "skiping entry. Encounterd a missing counter" + "skipping entry. Encountered a missing counter" ) print(expr, " has been assigned to None") print(np.nan) @@ -661,7 +667,7 @@ def eval_metric(dfs, dfs_type, sys_info, soc_spec, raw_pmc_df, debug): == "'NoneType' object has no attribute 'get'" ): print( - "skiping entry. Encounterd a missing csv" + "skipping entry. 
Encountered a missing csv" ) print(np.nan) else: @@ -769,7 +775,7 @@ def apply_filters(workload, dir, is_gui, debug): print("{} is an invalid dispatch id.".format(d)) sys.exit(1) if ">" in workload.filter_dispatch_ids[0]: - m = re.match("\> (\d+)", workload.filter_dispatch_ids[0]) + m = re.match(r"\> (\d+)", workload.filter_dispatch_ids[0]) ret_df = ret_df[ ret_df[schema.pmc_perf_file_prefix]["Index"] > int(m.group(1)) ] diff --git a/src/omniperf_analyze/utils/roofline_calc.py b/src/omniperf_analyze/utils/roofline_calc.py index 275005233..ee00b2458 100644 --- a/src/omniperf_analyze/utils/roofline_calc.py +++ b/src/omniperf_analyze/utils/roofline_calc.py @@ -184,7 +184,7 @@ def plot_roof(roof_details, roof_data, mem_level, verbose): # ------------------------------------------------------------------------------------- # Overlay application performance # ------------------------------------------------------------------------------------- -# Calculate relevent metrics for ai calculation +# Calculate relevant metrics for ai calculation def plot_application(sortType, ret_df, verbose): df = ret_df["pmc_perf"] # Sort by top kernels or top dispatches? diff --git a/src/omniperf_analyze/utils/schema.py b/src/omniperf_analyze/utils/schema.py index f9b59868f..adc19a504 100644 --- a/src/omniperf_analyze/utils/schema.py +++ b/src/omniperf_analyze/utils/schema.py @@ -79,7 +79,7 @@ class Workload: "Min", "Max", "Avg", - "PoP", + "Pct of Peak", "Peak", "Count", "Mean", @@ -91,22 +91,22 @@ class Workload: "Channel", "L2 Cache Hit Rate (%)", "Requests (Requests)", - "L1-L2 Read (Requests)", - "L1-L2 Write (Requests)", - "L1-L2 Atomic (Requests)", - "L2-EA Read (Requests)", - "L2-EA Write (Requests)", - "L2-EA Atomic (Requests)", - "L2-EA Read Latency (Cycles)", - "L2-EA Write Latency (Cycles)", - "L2-EA Atomic Latency (Cycles)", - "L2-EA Read Stall - IO (Cycles per)", - "L2-EA Read Stall - GMI (Cycles per)", - "L2-EA Read Stall - DRAM (Cycles per)", - "L2-EA Write Stall - IO (Cycles per)", - "L2-EA Write Stall - GMI (Cycles per)", - "L2-EA Write Stall - DRAM (Cycles per)", - "L2-EA Write Stall - Starve (Cycles per)", + "L2 Read (Requests)", + "L2 Write (Requests)", + "L2 Atomic (Requests)", + "L2-Fabric Read (Requests)", + "L2-Fabric Write and Atomic (Requests)", + "L2-Fabric Atomic (Requests)", + "L2-Fabric Read Latency (Cycles)", + "L2-Fabric Write Latency (Cycles)", + "L2-Fabric Atomic Latency (Cycles)", + "L2-Fabric Read Stall - PCIe (Cycles)", + "L2-Fabric Read Stall - Infinity Fabric™ (Cycles)", + "L2-Fabric Read Stall - HBM (Cycles)", + "L2-Fabric Write Stall - PCIe (Cycles)", + "L2-Fabric Write Stall - Infinity Fabric™ (Cycles)", + "L2-Fabric Write Stall - HBM (Cycles)", + "L2-Fabric Write Stall - Starve (Cycles)", ] # The prefix of raw pmc_perf.csv diff --git a/src/perfmon_pub/mi100/pmc_tcc_perf.txt b/src/perfmon_pub/mi100/pmc_tcc_perf.txt index 8a6d61de4..7aa7bef20 100644 --- a/src/perfmon_pub/mi100/pmc_tcc_perf.txt +++ b/src/perfmon_pub/mi100/pmc_tcc_perf.txt @@ -4,12 +4,12 @@ pmc: TCC_CYCLE_sum TCC_BUSY_sum pmc: TCC_NC_REQ_sum TCC_UC_REQ_sum TCC_CC_REQ_sum TCC_RW_REQ_sum pmc: TCC_REQ_sum TCC_STREAMING_REQ_sum TCC_HIT_sum TCC_MISS_sum pmc: TCC_READ_sum TCC_WRITE_sum TCC_ATOMIC_sum TCC_WRITEBACK_sum -pmc: TCC_EA_WRREQ_sum TCC_EA_WRREQ_64B_sum TCC_EA_WR_UNCACHED_32B_sum +pmc: TCC_EA_WRREQ_sum TCC_EA_WRREQ_64B_sum TCC_EA_WR_UNCACHED_32B_sum TCC_EA_WRREQ_DRAM_sum pmc: TCC_EA_WRREQ_STALL_sum TCC_EA_WRREQ_IO_CREDIT_STALL_sum TCC_EA_WRREQ_GMI_CREDIT_STALL_sum TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum -pmc: 
TCC_TOO_MANY_EA_WRREQS_STALL_sum TCC_EA_ATOMIC_sum TCC_EA_RDREQ_sum TCC_EA_RDREQ_32B_sum -pmc: TCC_EA_RD_UNCACHED_32B_sum TCC_EA_RDREQ_IO_CREDIT_STALL_sum TCC_EA_RDREQ_GMI_CREDIT_STALL_sum TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum +pmc: TCC_EA_RDREQ_sum TCC_EA_RDREQ_32B_sum TCC_EA_RD_UNCACHED_32B_sum TCC_EA_RDREQ_DRAM_sum +pmc: TCC_EA_RDREQ_IO_CREDIT_STALL_sum TCC_EA_RDREQ_GMI_CREDIT_STALL_sum TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum pmc: TCC_TAG_STALL_sum TCC_NORMAL_WRITEBACK_sum TCC_ALL_TC_OP_WB_WRITEBACK_sum TCC_NORMAL_EVICT_sum -pmc: TCC_ALL_TC_OP_INV_EVICT_sum TCC_EA_RDREQ_DRAM_sum TCC_EA_WRREQ_DRAM_sum +pmc: TCC_ALL_TC_OP_INV_EVICT_sum TCC_TOO_MANY_EA_WRREQS_STALL_sum TCC_EA_ATOMIC_sum pmc: TCC_EA_RDREQ_LEVEL_sum TCC_EA_WRREQ_LEVEL_sum TCC_EA_ATOMIC_LEVEL_sum gpu: diff --git a/src/perfmon_pub/mi200/pmc_tcc_perf.txt b/src/perfmon_pub/mi200/pmc_tcc_perf.txt index 8a6d61de4..5586b0d3e 100644 --- a/src/perfmon_pub/mi200/pmc_tcc_perf.txt +++ b/src/perfmon_pub/mi200/pmc_tcc_perf.txt @@ -4,12 +4,12 @@ pmc: TCC_CYCLE_sum TCC_BUSY_sum pmc: TCC_NC_REQ_sum TCC_UC_REQ_sum TCC_CC_REQ_sum TCC_RW_REQ_sum pmc: TCC_REQ_sum TCC_STREAMING_REQ_sum TCC_HIT_sum TCC_MISS_sum pmc: TCC_READ_sum TCC_WRITE_sum TCC_ATOMIC_sum TCC_WRITEBACK_sum -pmc: TCC_EA_WRREQ_sum TCC_EA_WRREQ_64B_sum TCC_EA_WR_UNCACHED_32B_sum +pmc: TCC_EA_WRREQ_sum TCC_EA_WRREQ_64B_sum TCC_EA_WR_UNCACHED_32B_sum TCC_EA_WRREQ_DRAM_sum pmc: TCC_EA_WRREQ_STALL_sum TCC_EA_WRREQ_IO_CREDIT_STALL_sum TCC_EA_WRREQ_GMI_CREDIT_STALL_sum TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum -pmc: TCC_TOO_MANY_EA_WRREQS_STALL_sum TCC_EA_ATOMIC_sum TCC_EA_RDREQ_sum TCC_EA_RDREQ_32B_sum -pmc: TCC_EA_RD_UNCACHED_32B_sum TCC_EA_RDREQ_IO_CREDIT_STALL_sum TCC_EA_RDREQ_GMI_CREDIT_STALL_sum TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum +pmc: TCC_EA_RDREQ_sum TCC_EA_RDREQ_32B_sum TCC_EA_RD_UNCACHED_32B_sum TCC_EA_RDREQ_DRAM_sum +pmc: TCC_EA_RDREQ_IO_CREDIT_STALL_sum TCC_EA_RDREQ_GMI_CREDIT_STALL_sum TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum pmc: TCC_TAG_STALL_sum TCC_NORMAL_WRITEBACK_sum TCC_ALL_TC_OP_WB_WRITEBACK_sum TCC_NORMAL_EVICT_sum -pmc: TCC_ALL_TC_OP_INV_EVICT_sum TCC_EA_RDREQ_DRAM_sum TCC_EA_WRREQ_DRAM_sum +pmc: TCC_ALL_TC_OP_INV_EVICT_sum TCC_TOO_MANY_EA_WRREQS_STALL_sum TCC_EA_ATOMIC_sum pmc: TCC_EA_RDREQ_LEVEL_sum TCC_EA_WRREQ_LEVEL_sum TCC_EA_ATOMIC_LEVEL_sum gpu: diff --git a/src/perfmon_pub/mi50/pmc_tcc_perf.txt b/src/perfmon_pub/mi50/pmc_tcc_perf.txt index dd71aba6a..7e22f0445 100644 --- a/src/perfmon_pub/mi50/pmc_tcc_perf.txt +++ b/src/perfmon_pub/mi50/pmc_tcc_perf.txt @@ -4,12 +4,12 @@ pmc: TCC_CYCLE_sum TCC_BUSY_sum pmc: TCC_NC_REQ_sum TCC_UC_REQ_sum TCC_CC_REQ_sum pmc: TCC_REQ_sum TCC_STREAMING_REQ_sum TCC_HIT_sum TCC_MISS_sum pmc: TCC_READ_sum TCC_WRITE_sum TCC_ATOMIC_sum TCC_WRITEBACK_sum -pmc: TCC_EA_WRREQ_sum TCC_EA_WRREQ_64B_sum TCC_EA_WR_UNCACHED_32B_sum +pmc: TCC_EA_WRREQ_sum TCC_EA_WRREQ_64B_sum TCC_EA_WR_UNCACHED_32B_sum TCC_EA_WRREQ_DRAM_sum pmc: TCC_EA_WRREQ_STALL_sum TCC_EA_WRREQ_IO_CREDIT_STALL_sum TCC_EA_WRREQ_GMI_CREDIT_STALL_sum TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum -pmc: TCC_TOO_MANY_EA_WRREQS_STALL_sum TCC_EA_ATOMIC_sum TCC_EA_RDREQ_sum TCC_EA_RDREQ_32B_sum -pmc: TCC_EA_RD_UNCACHED_32B_sum TCC_EA_RDREQ_IO_CREDIT_STALL_sum TCC_EA_RDREQ_GMI_CREDIT_STALL_sum TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum +pmc: TCC_EA_RDREQ_sum TCC_EA_RDREQ_32B_sum TCC_EA_RD_UNCACHED_32B_sum TCC_EA_RDREQ_DRAM_sum +pmc: TCC_EA_RDREQ_IO_CREDIT_STALL_sum TCC_EA_RDREQ_GMI_CREDIT_STALL_sum TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum pmc: TCC_NORMAL_WRITEBACK_sum TCC_ALL_TC_OP_WB_WRITEBACK_sum TCC_NORMAL_EVICT_sum -pmc: 
TCC_ALL_TC_OP_INV_EVICT_sum TCC_EA_RDREQ_DRAM_sum TCC_EA_WRREQ_DRAM_sum +pmc: TCC_ALL_TC_OP_INV_EVICT_sum TCC_TOO_MANY_EA_WRREQS_STALL_sum TCC_EA_ATOMIC_sum pmc: TCC_EA_RDREQ_LEVEL_sum TCC_EA_WRREQ_LEVEL_sum TCC_EA_ATOMIC_LEVEL_sum gpu: diff --git a/src/soc_params/mi100.csv b/src/soc_params/mi100.csv index c52a4e1bb..fd0c02cb1 100644 --- a/src/soc_params/mi100.csv +++ b/src/soc_params/mi100.csv @@ -1,2 +1,2 @@ -name,numSE,numCU,numSIMD,numWavesPerCU,numSQC,L2Banks,LDSBanks,Freq,mclk -mi100,8,120,480,40,30,32,32,1502,1200 +name,numSE,numPipes,numCU,numSIMD,numWavesPerCU,numSQC,L2Banks,LDSBanks,Freq,mclk +mi100,8,4,120,480,40,30,32,32,1502,1200 diff --git a/src/soc_params/mi200.csv b/src/soc_params/mi200.csv index bf6343fc0..64faa3c0f 100644 --- a/src/soc_params/mi200.csv +++ b/src/soc_params/mi200.csv @@ -1,2 +1,2 @@ -name,numSE,numCU,numSIMD,numWavesPerCU,numSQC,L2Banks,LDSBanks,Freq,mclk -mi200,8,110,440,32,56,32,32,1700,1600 +name,numSE,numPipes,numCU,numSIMD,numWavesPerCU,numSQC,L2Banks,LDSBanks,Freq,mclk +mi200,8,4,110,440,32,56,32,32,1700,1600 diff --git a/src/soc_params/mi50.csv b/src/soc_params/mi50.csv index f5e1bda0b..de62ad707 100644 --- a/src/soc_params/mi50.csv +++ b/src/soc_params/mi50.csv @@ -1,2 +1,2 @@ -name,numSE,numCU,numSIMD,numWavesPerCU,numSQC,L2Banks,LDSBanks,Freq,mclk -mi50,4,60,240,40,15,16,32,1725,1000 +name,numSE,numPipes,numCU,numSIMD,numWavesPerCU,numSQC,L2Banks,LDSBanks,Freq,mclk +mi50,4,4,60,240,40,15,16,32,1725,1000
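
Reviewer note (not part of the patch): a minimal sketch of how the two-argument form of `to_max()` added in `src/omniperf_analyze/utils/parser.py` appears to support the new "Remote Read" / "Remote Write and Atomic" expressions in `1700_L2_cache.yaml`, which use `MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0)`. With per-kernel counters held in pandas Series, the clamp has to be element-wise, so the two-argument case dispatches to `np.maximum`. The counter values below are invented purely for illustration.

```python
import numpy as np
import pandas as pd


def to_max(*args):
    # Single-Series form: reduce one counter column to its maximum
    # (pre-existing behavior in parser.py).
    if len(args) == 1 and isinstance(args[0], pd.Series):
        return args[0].max()
    # Two-argument form added by this patch: element-wise maximum, so an
    # expression like MAX(TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum, 0)
    # clamps each kernel's value at zero instead of collapsing the column.
    elif len(args) == 2 and (
        isinstance(args[0], pd.Series) or isinstance(args[1], pd.Series)
    ):
        return np.maximum(args[0], args[1])
    elif max(args) is None:
        return np.nan
    else:
        return max(*args)


# Illustrative (made-up) per-kernel counter values.
rdreq_total = pd.Series([1000, 2500, 400])  # TCC_EA_RDREQ_sum
rdreq_dram = pd.Series([900, 2600, 400])    # TCC_EA_RDREQ_DRAM_sum

# "Remote Read" requests: fabric reads not serviced by local HBM,
# clamped at zero per kernel to absorb counter jitter.
remote_read = to_max(rdreq_total - rdreq_dram, 0)
print(remote_read.tolist())  # [100, 0, 0]
```

The function mirrors the patched `to_max()` (with `is None` used in place of `== None` for the sketch); everything else, including the sample counter values, is hypothetical and only meant to show why the element-wise branch is needed for the new L2-Fabric breakdown metrics.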