Rework of metrics and documentation for Omniperf profiling (#224)

* add updated metrics Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * update analyze utilities Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * add updated images Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * add updated samples Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * add updated docs Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * updated perfmon / soc params Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * add nick to authors Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * add examples to readme Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * add vmem/smem latencies from upstream, and document Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * add warning about stall metrics Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * Update licenses Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * Update license on vcopy Signed-off-by: Nicholas Curtis <nicurtis@amd.com> --------- Signed-off-by: Nicholas Curtis <nicurtis@amd.com> Co-authored-by: Nicholas Curtis <nicurtis@amd.com>
ROCm · Jan 29, 2024 · d27a415 · d27a415
1 parent 435aa2b
commit d27a415
Show file tree

Hide file tree

Showing 161 changed files with 14,227 additions and 2,284 deletions.
diff --git a/AUTHORS b/AUTHORS
@@ -9,3 +9,4 @@ Cole Ramos
 Fei Zheng
 Jose Santos
 Karl Schultz
+Nicholas Curtis
diff --git a/README.md b/README.md
@@ -23,6 +23,11 @@ contribution process.
 
 * Licensing information can be found in the [LICENSE](LICENSE) file.
 
+## Examples
+A set of guided exercises demonstrating kernel optimization using Omniperf can be found in the [amd/HPCTrainingExamples](https://github.com/amd/HPCTrainingExamples/tree/main/OmniperfExamples) repo.
+
+
+
 ## Development 
 
 Omniperf follows a

diff --git a/sample/common.h b/sample/common.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <hip/hip_runtime.h>
+#include <iostream>
+
+#define hipCheck(stmt)                                                         \
+  do {                                                                         \
+    hipError_t err = stmt;                                                     \
+    if (err != hipSuccess) {                                                   \
+      char msg[256];                                                           \
+      sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__,   \
+              __FUNCTION__, __LINE__);                                         \
+      std::string errstring = hipGetErrorString(err);                          \
+      std::cerr << msg << "\t" << errstring << std::endl;                      \
+      throw std::runtime_error(msg);                                           \
+    }                                                                          \
+  } while (0)
diff --git a/sample/fabric.hip b/sample/fabric.hip
@@ -0,0 +1,315 @@
+/*
+##############################################################################bl
+# MIT License
+#
+# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+##############################################################################el
+
+
+
+A data-fabric exerciser example, written by Nicholas Curtis [AMD]
+
+The test allows the user to control the:
+  - The granularity of an allocation (Coarse vs Fine-grained),
+  - The owner of an allocation (local HBM, CPU DRAM or remote HBM),
+  - The size of an allocation (the default is ~4GiB), and
+  - The type of operation we are executing (read, write, atomics of various flavors)
+
+This lets the user explore the impact of these choices on the generated
+data-fabric traffic.
+*/
+
+
+#include <getopt.h>
+#include <hip/hip_runtime.h>
+
+#include <iostream>
+#include <vector>
+
+#include "common.h"
+
+enum class mtype : int { FineGrained = 0, CoarseGrained = 1, Undef = 3 };
+enum class mowner : int { Device = 0, Host = 1, Remote = 2, Undef = 3 };
+enum class mspace : int { Global = 0, Undef = 1 };
+enum class mop : int {
+  Read = 0,
+  Write = 1,
+  AtomicAdd = 2,
+  AtomicCas = 3,
+  AtomicOr = 4,
+  AtomicMax = 5,
+  Undef = 6
+};
+enum class mdata : int { Unsigned = 0, UnsignedLong = 1, Float = 2, Double = 3, Undef = 4 };
+
+template<typename T>
+T parse(const char* value) {
+  int ivalue = std::atoi(value);
+  if (ivalue < 0 || ivalue >= int(T::Undef)) {
+    throw std::runtime_error("bad enum value!");
+  }
+  return T(ivalue);
+}
+
+void parse(int argc, char** argv, mtype& mytype, mowner& myowner,
+           mspace& myspace, size_t& size, mop& myop, mdata& mydata,
+           int& remoteId) {
+  while (1) {
+    static struct option long_options[] = {
+        /* These options set a flag. */
+        {"type", required_argument, 0, 't'},
+        {"owner", required_argument, 0, 'o'},
+        {"size", required_argument, 0, 'z'},
+        {"op", required_argument, 0, 'p'},
+        {"remote", required_argument, 0, 'r'},
+        {"data", required_argument, 0, 'd'},
+        {0, 0, 0, 0}};
+    /* getopt_long stores the option index here. */
+    int option_index = 0;
+
+    int c =
+        getopt_long(argc, argv, "t:o:z:p:r:d:", long_options, &option_index);
+
+    /* Detect the end of the options. */
+    if (c == -1) break;
+
+    switch (c) {
+      case 't':
+        mytype = parse<mtype>(optarg);
+        break;
+
+      case 'o':
+        myowner = parse<mowner>(optarg);
+        break;
+
+      case 'z':
+        size = std::atoll(optarg);
+        break;
+
+      case 'p':
+        myop = parse<mop>(optarg);
+        break;
+
+      case 'r':
+        remoteId = std::atoi(optarg);
+        break;
+
+      case 'd':
+        mydata = parse<mdata>(optarg);
+        break;
+
+      case '?':
+        /* getopt_long already printed an error message. */
+        break;
+
+      default:
+        abort();
+    }
+  }
+  std::cout << "Using: " << std::endl;
+  std::cout << "\tmtype:"
+            << ((mytype == mtype::FineGrained) ? "FineGrained"
+                                               : "CoarseGrained")
+            << std::endl;
+  std::cout << "\tmowner:"
+            << ((myowner == mowner::Device)
+                    ? "Device"
+                    : ((myowner == mowner::Host) ? "Host" : "Remote"))
+            << std::endl;
+  std::cout << "\tmspace:Global" << std::endl;
+  std::cout << "\tmop:" << ((myop == mop::Read) ? "Read" : (myop == mop::Write ? "Write" : (myop == mop::AtomicAdd ? "Add" : (myop == mop::AtomicCas ? "CAS" : (myop == mop::AtomicOr ? "Or" : "Max"))))) << std::endl;
+  std::cout << "\tmdata:" << (mydata == mdata::Unsigned ? "Unsigned" : (mydata == mdata::UnsignedLong ? "Unsigned Long" : (mydata == mdata::Float ? "Float" : "Double"))) << std::endl;
+  std::cout << "\tremoteId:" << remoteId << std::endl;
+}
+
+// dummy intialization kernel
+__global__ void init() {}
+
+template <typename T>
+void alloc(mtype memory, mowner owner, T** ptr, size_t Nbytes, int devId,
+           int remoteId) {
+  bool is_device = (owner == mowner::Device) || (owner == mowner::Remote);
+  if (owner == mowner::Remote) {
+    // enable remote access
+    hipCheck(hipDeviceEnablePeerAccess(remoteId, 0));
+    // set id for alloc
+    hipCheck(hipSetDevice(remoteId));
+  }
+  init<<<1, 1>>>();
+
+  if (memory == mtype::FineGrained && is_device) {
+    hipCheck(
+        hipExtMallocWithFlags((void**)ptr, Nbytes, hipDeviceMallocFinegrained));
+  } else if (memory == mtype::CoarseGrained && is_device) {
+    hipCheck(hipMalloc(ptr, Nbytes));
+  } else if (memory == mtype::FineGrained && owner == mowner::Host) {
+    hipCheck(hipHostMalloc(ptr, Nbytes, hipHostMallocCoherent));
+  } else if (memory == mtype::CoarseGrained && owner == mowner::Host) {
+    hipCheck(hipHostMalloc(ptr, Nbytes, hipHostMallocNonCoherent));
+  } else {
+    assert(false && "unknown combo");
+  }
+
+  // set to random
+  std::vector<T> host(Nbytes / sizeof(T), T(0));
+  hipCheck(hipMemcpy(*ptr, &host[0], Nbytes,
+                  (is_device ? hipMemcpyHostToDevice : hipMemcpyHostToHost)));
+
+  if (owner == mowner::Remote) {
+    // reset id for execution
+    hipCheck(hipSetDevice(devId));
+  }
+}
+
+template <typename T>
+void release(mtype memory, mowner owner, T* ptr) {
+  bool is_device = (owner == mowner::Device) || (owner == mowner::Remote);
+  if (memory == mtype::FineGrained && is_device) {
+    hipCheck(hipFree(ptr));
+  } else if (memory == mtype::CoarseGrained && is_device) {
+    hipCheck(hipFree(ptr));
+  } else if (memory == mtype::FineGrained && owner == mowner::Host) {
+    hipCheck(hipHostFree(ptr));
+  } else if (memory == mtype::CoarseGrained && owner == mowner::Host) {
+    hipCheck(hipHostFree(ptr));
+  } else {
+    assert(false && "unknown combo");
+  }
+}
+
+// the main streaming kernel
+template <mop op, typename T, int repeats = 10>
+__global__ void kernel(T* x, size_t N, T zero, T foo) {
+  int sum = 0;
+  const size_t offset_start = threadIdx.x + blockIdx.x * blockDim.x;
+  for (int i = 0; i < repeats; ++i) {
+    for (size_t offset = offset_start; offset < N;
+         offset += blockDim.x * gridDim.x) {
+      T uniq = (foo + offset) + i;
+      if constexpr (op == mop::Read) {
+        sum += x[offset];
+      } else if constexpr (op == mop::Write) {
+        x[offset] = (T)offset;
+      } else if constexpr (op == mop::AtomicAdd) {
+        atomicAdd(&x[offset], uniq);
+      } else if constexpr (op == mop::AtomicCas) {
+        atomicCAS(&x[offset], uniq, uniq);
+      } else if constexpr (op == mop::AtomicOr) {
+        atomicOr(&x[offset], uniq);
+      } else if constexpr (op == mop::AtomicMax) {
+        atomicMax(&x[offset], uniq);
+      }
+    }
+  }
+  if constexpr (op == mop::Read) {
+    if (sum != 0) {
+      x[offset_start] = sum;
+    }
+  }
+}
+
+template <mop op, typename T, int nrepeats = 10>
+void run_kernel(T* x, size_t size) {
+  if constexpr (op == mop::AtomicOr && std::is_floating_point_v<T>) {
+    throw std::runtime_error("bad");
+  } else {
+    kernel<op, T, nrepeats><<<4096, 1024>>>(x, size, 0, T(23456789));
+    // then run once for data collection
+    kernel<op, T, nrepeats><<<4096, 1024>>>(x, size, 0, T(23456789));
+  }
+}
+
+template <mop op, typename T>
+void run_atomic(mowner myowner, T* x, size_t size) {
+  if (myowner == mowner::Host) {
+    // speed it up
+    run_kernel<op, T, 1>(x, size / 10);
+  } else {
+    run_kernel<op, T>(x, size);
+  }
+}
+
+template <typename T>
+void run(mtype mytype, mspace myspace, mowner myowner, mop myop, int remoteId,
+         size_t size) {
+  int devId = 0;
+  if (myowner == mowner::Remote && remoteId == -1) {
+    // need to find a remote GPU
+    int ndevices;
+    hipCheck(hipGetDeviceCount(&ndevices));
+    if (ndevices <= 1) {
+      throw std::runtime_error(
+          "Need >=2 devices available for mowner = Remote");
+    }
+    for (int i = 0; i < ndevices; ++i) {
+      if (i != devId) {
+        remoteId = i;
+        break;
+      }
+    }
+  }
+
+  T* x;
+  alloc(mytype, myowner, &x, size * sizeof(T), devId, remoteId);
+
+  // run the kernel once for warmup
+  assert(4096 * 1024 < size);
+  if (myop == mop::Read) {
+    run_kernel<mop::Read>(x, size);
+  } else if (myop == mop::Write) {
+    run_kernel<mop::Write>(x, size);
+  } else if (myop == mop::AtomicAdd) {
+    run_atomic<mop::AtomicAdd>(myowner, x, size);
+  } else if (myop == mop::AtomicCas) {
+    run_atomic<mop::AtomicCas>(myowner, x, size);
+  } else if (myop == mop::AtomicOr) {
+    run_atomic<mop::AtomicOr>(myowner, x, size);
+  } else if (myop == mop::AtomicMax) {
+    run_atomic<mop::AtomicMax>(myowner, x, size);
+  } else {
+    throw std::runtime_error("bad");
+  }
+  hipCheck(hipDeviceSynchronize());
+  release(mytype, myowner, x);
+}
+
+int main(int argc, char** argv) {
+  mtype mytype = (mtype)0;
+  mspace myspace = (mspace)0;
+  mowner myowner = (mowner)0;
+  mop myop = (mop)0;
+  mdata mydata = (mdata)0;
+  int remoteId = -1;
+  size_t size = 1024ull * 1024ull *
+                1024ull;  // 4 GiB, purposefully much larger than caches.
+  parse(argc, argv, mytype, myowner, myspace, size, myop, mydata, remoteId);
+  if (mydata == mdata::Unsigned)
+    run<unsigned>(mytype, myspace, myowner, myop, remoteId, size);
+  else if (mydata == mdata::UnsignedLong)
+    run<unsigned long>(mytype, myspace, myowner, myop, remoteId, size);
+  else if (mydata == mdata::Float)
+    run<float>(mytype, myspace, myowner, myop, remoteId, size);
+  else if (mydata == mdata::Double)
+    run<double>(mytype, myspace, myowner, myop, remoteId, size);
+  else {
+    throw std::runtime_error("bad");
+  }
+}