-
Notifications
You must be signed in to change notification settings - Fork 46
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Rework of metrics and documentation for Omniperf profiling (#224)
* add updated metrics Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * update analyze utilities Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * add updated images Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * add updated samples Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * add updated docs Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * updated perfmon / soc params Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * add nick to authors Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * add examples to readme Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * add vmem/smem latencies from upstream, and document Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * add warning about stall metrics Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * Update licenses Signed-off-by: Nicholas Curtis <nicurtis@amd.com> * Update license on vcopy Signed-off-by: Nicholas Curtis <nicurtis@amd.com> --------- Signed-off-by: Nicholas Curtis <nicurtis@amd.com> Co-authored-by: Nicholas Curtis <nicurtis@amd.com>
- Loading branch information
1 parent
435aa2b
commit d27a415
Showing
161 changed files
with
14,227 additions
and
2,284 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,3 +9,4 @@ Cole Ramos | |
Fei Zheng | ||
Jose Santos | ||
Karl Schultz | ||
Nicholas Curtis |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#pragma once | ||
|
||
#include <hip/hip_runtime.h> | ||
#include <iostream> | ||
|
||
#define hipCheck(stmt) \ | ||
do { \ | ||
hipError_t err = stmt; \ | ||
if (err != hipSuccess) { \ | ||
char msg[256]; \ | ||
sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, \ | ||
__FUNCTION__, __LINE__); \ | ||
std::string errstring = hipGetErrorString(err); \ | ||
std::cerr << msg << "\t" << errstring << std::endl; \ | ||
throw std::runtime_error(msg); \ | ||
} \ | ||
} while (0) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,315 @@ | ||
/* | ||
##############################################################################bl | ||
# MIT License | ||
# | ||
# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. | ||
# | ||
# Permission is hereby granted, free of charge, to any person obtaining a copy | ||
# of this software and associated documentation files (the "Software"), to deal | ||
# in the Software without restriction, including without limitation the rights | ||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
# copies of the Software, and to permit persons to whom the Software is | ||
# furnished to do so, subject to the following conditions: | ||
# | ||
# The above copyright notice and this permission notice shall be included in all | ||
# copies or substantial portions of the Software. | ||
# | ||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
# SOFTWARE. | ||
##############################################################################el | ||
|
||
|
||
|
||
A data-fabric exerciser example, written by Nicholas Curtis [AMD] | ||
|
||
The test allows the user to control the: | ||
- The granularity of an allocation (Coarse vs Fine-grained), | ||
- The owner of an allocation (local HBM, CPU DRAM or remote HBM), | ||
- The size of an allocation (the default is ~4GiB), and | ||
- The type of operation we are executing (read, write, atomics of various flavors) | ||
|
||
This lets the user explore the impact of these choices on the generated | ||
data-fabric traffic. | ||
*/ | ||
|
||
|
||
#include <getopt.h> | ||
#include <hip/hip_runtime.h> | ||
|
||
#include <iostream> | ||
#include <vector> | ||
|
||
#include "common.h" | ||
|
||
enum class mtype : int { FineGrained = 0, CoarseGrained = 1, Undef = 3 }; | ||
enum class mowner : int { Device = 0, Host = 1, Remote = 2, Undef = 3 }; | ||
enum class mspace : int { Global = 0, Undef = 1 }; | ||
enum class mop : int { | ||
Read = 0, | ||
Write = 1, | ||
AtomicAdd = 2, | ||
AtomicCas = 3, | ||
AtomicOr = 4, | ||
AtomicMax = 5, | ||
Undef = 6 | ||
}; | ||
enum class mdata : int { Unsigned = 0, UnsignedLong = 1, Float = 2, Double = 3, Undef = 4 }; | ||
|
||
template<typename T> | ||
T parse(const char* value) { | ||
int ivalue = std::atoi(value); | ||
if (ivalue < 0 || ivalue >= int(T::Undef)) { | ||
throw std::runtime_error("bad enum value!"); | ||
} | ||
return T(ivalue); | ||
} | ||
|
||
void parse(int argc, char** argv, mtype& mytype, mowner& myowner, | ||
mspace& myspace, size_t& size, mop& myop, mdata& mydata, | ||
int& remoteId) { | ||
while (1) { | ||
static struct option long_options[] = { | ||
/* These options set a flag. */ | ||
{"type", required_argument, 0, 't'}, | ||
{"owner", required_argument, 0, 'o'}, | ||
{"size", required_argument, 0, 'z'}, | ||
{"op", required_argument, 0, 'p'}, | ||
{"remote", required_argument, 0, 'r'}, | ||
{"data", required_argument, 0, 'd'}, | ||
{0, 0, 0, 0}}; | ||
/* getopt_long stores the option index here. */ | ||
int option_index = 0; | ||
|
||
int c = | ||
getopt_long(argc, argv, "t:o:z:p:r:d:", long_options, &option_index); | ||
|
||
/* Detect the end of the options. */ | ||
if (c == -1) break; | ||
|
||
switch (c) { | ||
case 't': | ||
mytype = parse<mtype>(optarg); | ||
break; | ||
|
||
case 'o': | ||
myowner = parse<mowner>(optarg); | ||
break; | ||
|
||
case 'z': | ||
size = std::atoll(optarg); | ||
break; | ||
|
||
case 'p': | ||
myop = parse<mop>(optarg); | ||
break; | ||
|
||
case 'r': | ||
remoteId = std::atoi(optarg); | ||
break; | ||
|
||
case 'd': | ||
mydata = parse<mdata>(optarg); | ||
break; | ||
|
||
case '?': | ||
/* getopt_long already printed an error message. */ | ||
break; | ||
|
||
default: | ||
abort(); | ||
} | ||
} | ||
std::cout << "Using: " << std::endl; | ||
std::cout << "\tmtype:" | ||
<< ((mytype == mtype::FineGrained) ? "FineGrained" | ||
: "CoarseGrained") | ||
<< std::endl; | ||
std::cout << "\tmowner:" | ||
<< ((myowner == mowner::Device) | ||
? "Device" | ||
: ((myowner == mowner::Host) ? "Host" : "Remote")) | ||
<< std::endl; | ||
std::cout << "\tmspace:Global" << std::endl; | ||
std::cout << "\tmop:" << ((myop == mop::Read) ? "Read" : (myop == mop::Write ? "Write" : (myop == mop::AtomicAdd ? "Add" : (myop == mop::AtomicCas ? "CAS" : (myop == mop::AtomicOr ? "Or" : "Max"))))) << std::endl; | ||
std::cout << "\tmdata:" << (mydata == mdata::Unsigned ? "Unsigned" : (mydata == mdata::UnsignedLong ? "Unsigned Long" : (mydata == mdata::Float ? "Float" : "Double"))) << std::endl; | ||
std::cout << "\tremoteId:" << remoteId << std::endl; | ||
} | ||
|
||
// dummy intialization kernel | ||
__global__ void init() {} | ||
|
||
template <typename T> | ||
void alloc(mtype memory, mowner owner, T** ptr, size_t Nbytes, int devId, | ||
int remoteId) { | ||
bool is_device = (owner == mowner::Device) || (owner == mowner::Remote); | ||
if (owner == mowner::Remote) { | ||
// enable remote access | ||
hipCheck(hipDeviceEnablePeerAccess(remoteId, 0)); | ||
// set id for alloc | ||
hipCheck(hipSetDevice(remoteId)); | ||
} | ||
init<<<1, 1>>>(); | ||
|
||
if (memory == mtype::FineGrained && is_device) { | ||
hipCheck( | ||
hipExtMallocWithFlags((void**)ptr, Nbytes, hipDeviceMallocFinegrained)); | ||
} else if (memory == mtype::CoarseGrained && is_device) { | ||
hipCheck(hipMalloc(ptr, Nbytes)); | ||
} else if (memory == mtype::FineGrained && owner == mowner::Host) { | ||
hipCheck(hipHostMalloc(ptr, Nbytes, hipHostMallocCoherent)); | ||
} else if (memory == mtype::CoarseGrained && owner == mowner::Host) { | ||
hipCheck(hipHostMalloc(ptr, Nbytes, hipHostMallocNonCoherent)); | ||
} else { | ||
assert(false && "unknown combo"); | ||
} | ||
|
||
// set to random | ||
std::vector<T> host(Nbytes / sizeof(T), T(0)); | ||
hipCheck(hipMemcpy(*ptr, &host[0], Nbytes, | ||
(is_device ? hipMemcpyHostToDevice : hipMemcpyHostToHost))); | ||
|
||
if (owner == mowner::Remote) { | ||
// reset id for execution | ||
hipCheck(hipSetDevice(devId)); | ||
} | ||
} | ||
|
||
template <typename T> | ||
void release(mtype memory, mowner owner, T* ptr) { | ||
bool is_device = (owner == mowner::Device) || (owner == mowner::Remote); | ||
if (memory == mtype::FineGrained && is_device) { | ||
hipCheck(hipFree(ptr)); | ||
} else if (memory == mtype::CoarseGrained && is_device) { | ||
hipCheck(hipFree(ptr)); | ||
} else if (memory == mtype::FineGrained && owner == mowner::Host) { | ||
hipCheck(hipHostFree(ptr)); | ||
} else if (memory == mtype::CoarseGrained && owner == mowner::Host) { | ||
hipCheck(hipHostFree(ptr)); | ||
} else { | ||
assert(false && "unknown combo"); | ||
} | ||
} | ||
|
||
// the main streaming kernel | ||
template <mop op, typename T, int repeats = 10> | ||
__global__ void kernel(T* x, size_t N, T zero, T foo) { | ||
int sum = 0; | ||
const size_t offset_start = threadIdx.x + blockIdx.x * blockDim.x; | ||
for (int i = 0; i < repeats; ++i) { | ||
for (size_t offset = offset_start; offset < N; | ||
offset += blockDim.x * gridDim.x) { | ||
T uniq = (foo + offset) + i; | ||
if constexpr (op == mop::Read) { | ||
sum += x[offset]; | ||
} else if constexpr (op == mop::Write) { | ||
x[offset] = (T)offset; | ||
} else if constexpr (op == mop::AtomicAdd) { | ||
atomicAdd(&x[offset], uniq); | ||
} else if constexpr (op == mop::AtomicCas) { | ||
atomicCAS(&x[offset], uniq, uniq); | ||
} else if constexpr (op == mop::AtomicOr) { | ||
atomicOr(&x[offset], uniq); | ||
} else if constexpr (op == mop::AtomicMax) { | ||
atomicMax(&x[offset], uniq); | ||
} | ||
} | ||
} | ||
if constexpr (op == mop::Read) { | ||
if (sum != 0) { | ||
x[offset_start] = sum; | ||
} | ||
} | ||
} | ||
|
||
template <mop op, typename T, int nrepeats = 10> | ||
void run_kernel(T* x, size_t size) { | ||
if constexpr (op == mop::AtomicOr && std::is_floating_point_v<T>) { | ||
throw std::runtime_error("bad"); | ||
} else { | ||
kernel<op, T, nrepeats><<<4096, 1024>>>(x, size, 0, T(23456789)); | ||
// then run once for data collection | ||
kernel<op, T, nrepeats><<<4096, 1024>>>(x, size, 0, T(23456789)); | ||
} | ||
} | ||
|
||
template <mop op, typename T> | ||
void run_atomic(mowner myowner, T* x, size_t size) { | ||
if (myowner == mowner::Host) { | ||
// speed it up | ||
run_kernel<op, T, 1>(x, size / 10); | ||
} else { | ||
run_kernel<op, T>(x, size); | ||
} | ||
} | ||
|
||
template <typename T> | ||
void run(mtype mytype, mspace myspace, mowner myowner, mop myop, int remoteId, | ||
size_t size) { | ||
int devId = 0; | ||
if (myowner == mowner::Remote && remoteId == -1) { | ||
// need to find a remote GPU | ||
int ndevices; | ||
hipCheck(hipGetDeviceCount(&ndevices)); | ||
if (ndevices <= 1) { | ||
throw std::runtime_error( | ||
"Need >=2 devices available for mowner = Remote"); | ||
} | ||
for (int i = 0; i < ndevices; ++i) { | ||
if (i != devId) { | ||
remoteId = i; | ||
break; | ||
} | ||
} | ||
} | ||
|
||
T* x; | ||
alloc(mytype, myowner, &x, size * sizeof(T), devId, remoteId); | ||
|
||
// run the kernel once for warmup | ||
assert(4096 * 1024 < size); | ||
if (myop == mop::Read) { | ||
run_kernel<mop::Read>(x, size); | ||
} else if (myop == mop::Write) { | ||
run_kernel<mop::Write>(x, size); | ||
} else if (myop == mop::AtomicAdd) { | ||
run_atomic<mop::AtomicAdd>(myowner, x, size); | ||
} else if (myop == mop::AtomicCas) { | ||
run_atomic<mop::AtomicCas>(myowner, x, size); | ||
} else if (myop == mop::AtomicOr) { | ||
run_atomic<mop::AtomicOr>(myowner, x, size); | ||
} else if (myop == mop::AtomicMax) { | ||
run_atomic<mop::AtomicMax>(myowner, x, size); | ||
} else { | ||
throw std::runtime_error("bad"); | ||
} | ||
hipCheck(hipDeviceSynchronize()); | ||
release(mytype, myowner, x); | ||
} | ||
|
||
int main(int argc, char** argv) { | ||
mtype mytype = (mtype)0; | ||
mspace myspace = (mspace)0; | ||
mowner myowner = (mowner)0; | ||
mop myop = (mop)0; | ||
mdata mydata = (mdata)0; | ||
int remoteId = -1; | ||
size_t size = 1024ull * 1024ull * | ||
1024ull; // 4 GiB, purposefully much larger than caches. | ||
parse(argc, argv, mytype, myowner, myspace, size, myop, mydata, remoteId); | ||
if (mydata == mdata::Unsigned) | ||
run<unsigned>(mytype, myspace, myowner, myop, remoteId, size); | ||
else if (mydata == mdata::UnsignedLong) | ||
run<unsigned long>(mytype, myspace, myowner, myop, remoteId, size); | ||
else if (mydata == mdata::Float) | ||
run<float>(mytype, myspace, myowner, myop, remoteId, size); | ||
else if (mydata == mdata::Double) | ||
run<double>(mytype, myspace, myowner, myop, remoteId, size); | ||
else { | ||
throw std::runtime_error("bad"); | ||
} | ||
} |
Oops, something went wrong.