diff --git a/CMakeLists.txt b/CMakeLists.txt index c88bd54f..6fd11fb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,7 @@ set(CMAKE_CXX_EXTENSIONS OFF) if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) option( HIPTENSOR_BUILD_TESTS "Build hiptensor tests" ON ) option( HIPTENSOR_BUILD_SAMPLES "Build hiptensor samples" ON ) + option( HIPTENSOR_DATA_LAYOUT_COL_MAJOR "Set hiptensor data layout to column major" ON ) endif() # Setup output paths @@ -93,6 +94,13 @@ else() endif() message( VERBOSE "AMDGPU_TARGETS=${AMDGPU_TARGETS}") +if(HIPTENSOR_DATA_LAYOUT_COL_MAJOR) + add_compile_definitions(HIPTENSOR_DATA_LAYOUT_COL_MAJOR=1) +else() + add_compile_definitions(HIPTENSOR_DATA_LAYOUT_COL_MAJOR=0) +endif() +message("-- HIPTENSOR_DATA_LAYOUT_COL_MAJOR=${HIPTENSOR_DATA_LAYOUT_COL_MAJOR}") + # Setup HIP find_package(hip REQUIRED ) message(STATUS "HIP version: ${hip_VERSION}") diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index 078689ea..09f5ddf6 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -87,7 +87,7 @@ hiptensorStatus_t hiptensorInitContractionDescriptor(const hiptensorHandle_t* auto& logger = Logger::instance(); // Log API access - char msg[1024]; + char msg[2048]; snprintf( msg, sizeof(msg), diff --git a/library/src/permutation/hiptensor_permutation.cpp b/library/src/permutation/hiptensor_permutation.cpp index b3d60379..2b092655 100644 --- a/library/src/permutation/hiptensor_permutation.cpp +++ b/library/src/permutation/hiptensor_permutation.cpp @@ -26,7 +26,7 @@ #include #include "logger.hpp" -#include "permutation_ck_col.hpp" +#include "permutation_ck.hpp" hiptensorStatus_t hiptensorPermutation(const hiptensorHandle_t* handle, const void* alpha, diff --git a/library/src/permutation/permutation_ck_col.hpp b/library/src/permutation/permutation_ck.hpp similarity index 98% rename from 
library/src/permutation/permutation_ck_col.hpp rename to library/src/permutation/permutation_ck.hpp index 600658c6..8fa4959c 100644 --- a/library/src/permutation/permutation_ck_col.hpp +++ b/library/src/permutation/permutation_ck.hpp @@ -44,5 +44,5 @@ namespace hiptensor } } -#include "permutation_ck_col_impl.hpp" +#include "permutation_ck_impl.hpp" #endif // HIPTENSOR_PERMUTATION_CK_COL_HPP diff --git a/library/src/permutation/permutation_ck_col_impl.hpp b/library/src/permutation/permutation_ck_impl.hpp similarity index 64% rename from library/src/permutation/permutation_ck_col_impl.hpp rename to library/src/permutation/permutation_ck_impl.hpp index 7f16bb8a..7878e908 100644 --- a/library/src/permutation/permutation_ck_col_impl.hpp +++ b/library/src/permutation/permutation_ck_impl.hpp @@ -26,11 +26,13 @@ #ifndef HIPTENSOR_PERMUTATION_CK_COL_IMPL_HPP #define HIPTENSOR_PERMUTATION_CK_COL_IMPL_HPP #include +#include #include #include #include +#include "performance.hpp" #include "types.hpp" namespace hiptensor @@ -74,35 +76,50 @@ namespace hiptensor modeToLength[modeA[index]] = descA->mLengths[index]; } + float alphaValue = readVal(alpha, typeScalar); + std::array input = {A}; + std::array output = {B}; std::unordered_map bModeToStrides; - int32_t stride = 1; - bModeToStrides[modeB[0]] = stride; +#if HIPTENSOR_DATA_LAYOUT_COL_MAJOR + std::array aStrides + = {1, + modeToLength[modeA[0]], + modeToLength[modeA[0]] * modeToLength[modeA[1]], + modeToLength[modeA[0]] * modeToLength[modeA[1]] * modeToLength[modeA[2]]}; + int32_t stride = 1; + bModeToStrides[modeB[0]] = stride; for(int32_t index = 1; index < modeSize; index++) { stride *= modeToLength[modeB[index - 1]]; bModeToStrides[modeB[index]] = stride; } - - float alphaValue = readVal(alpha, typeScalar); - std::array input = {A}; - std::array output = {B}; - std::array a_strides - = {1, - modeToLength[modeA[0]], - modeToLength[modeA[0]] * modeToLength[modeA[1]], - modeToLength[modeA[0]] * modeToLength[modeA[1]] * 
modeToLength[modeA[2]]}; - std::array b_strides = {bModeToStrides[modeA[0]], +#else // HIPTENSOR_DATA_LAYOUT_COL_MAJOR + std::array aStrides = { + modeToLength[modeA[1]] * modeToLength[modeA[2]] * modeToLength[modeA[3]], + modeToLength[modeA[2]] * modeToLength[modeA[3]], + modeToLength[modeA[3]], + 1, + }; + int32_t stride = 1; + bModeToStrides[modeB[modeSize - 1]] = stride; + for(int32_t index = modeSize - 2; index >= 0; index--) + { + stride *= modeToLength[modeB[index + 1]]; + bModeToStrides[modeB[index]] = stride; + } +#endif // HIPTENSOR_DATA_LAYOUT_COL_MAJOR + std::array bStrides = {bModeToStrides[modeA[0]], bModeToStrides[modeA[1]], bModeToStrides[modeA[2]], bModeToStrides[modeA[3]]}; - std::array ab_lengths = {modeToLength[modeA[0]], + std::array abLengths = {modeToLength[modeA[0]], modeToLength[modeA[1]], modeToLength[modeA[2]], modeToLength[modeA[3]]}; auto broadcastPermute = DeviceElementwisePermuteInstance{}; - auto argument = broadcastPermute.MakeArgumentPointer(ab_lengths, - {a_strides}, - {b_strides}, + auto argument = broadcastPermute.MakeArgumentPointer(abLengths, + {aStrides}, + {bStrides}, input, output, PassThrough{}, @@ -115,7 +132,44 @@ namespace hiptensor }; auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer(); - broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{stream, false}); + + // Perform permutation with timing if LOG_LEVEL_PERF_TRACE + using hiptensor::Logger; + auto& logger = Logger::instance(); + bool measurePermuteTime = logger->getLogMask() & HIPTENSOR_LOG_LEVEL_PERF_TRACE; + + auto permuteTime = broadcastPermute_invoker_ptr->Run( + argument.get(), StreamConfig{stream, measurePermuteTime}); + if(measurePermuteTime) + { + std::size_t problemSize + = std::accumulate(abLengths.begin(), abLengths.end(), 1, std::multiplies{}); + std::size_t flops = std::size_t(2) * problemSize; + + std::size_t bytes = 2 * sizeof(DataType) * problemSize; + float tflops = static_cast(flops) / 1.E9 / permuteTime; + float 
bandwidth = bytes / 1.E6 / permuteTime; + + hiptensor::PerfMetrics metrics = { + 0, // id, permute has only one solution, set id to 0 + "default solution", // name + permuteTime, // avg time + tflops, // tflops + bandwidth // BW + }; + + // log perf metrics, including kernel id and name + char msg[2048]; + snprintf(msg, + sizeof(msg), + "KernelId: %lu KernelName: %s, %0.3f ms, %0.3f TFlops, %0.3f GB/s", + metrics.mKernelUid, + metrics.mKernelName.c_str(), + metrics.mAvgTimeMs, + metrics.mTflops, + metrics.mBandwidth); + logger->logPerformanceTrace("hiptensorPermutation", msg); + } return HIPTENSOR_STATUS_SUCCESS; } } diff --git a/library/src/permutation/permutation_cpu_reference_impl.hpp b/library/src/permutation/permutation_cpu_reference_impl.hpp index a04e4176..d64147fe 100644 --- a/library/src/permutation/permutation_cpu_reference_impl.hpp +++ b/library/src/permutation/permutation_cpu_reference_impl.hpp @@ -56,19 +56,26 @@ namespace hiptensor bModeToIndex[modeB[index]] = index; } - auto& aLens = descA->mLengths; - // auto bStrides = descB->mStrides; // TODO descB->mStrides contains incorrect strides - auto bStrides = std::vector(modeSize, 1); + auto& aLens = descA->mLengths; + auto bStrides = std::vector(modeSize, 1); +#if HIPTENSOR_DATA_LAYOUT_COL_MAJOR for(int i = 1; i < modeSize; i++) { bStrides[i] = descB->mLengths[i - 1] * bStrides[i - 1]; } +#else // HIPTENSOR_DATA_LAYOUT_COL_MAJOR + for(int i = modeSize - 2; i >= 0; i--) + { + bStrides[i] = descB->mLengths[i + 1] * bStrides[i + 1]; + } +#endif // HIPTENSOR_DATA_LAYOUT_COL_MAJOR auto bIndices = std::vector(modeSize, 0); auto elementCount = hiptensor::elementsFromLengths(aLens); float alphaValue = readVal(alpha, typeScalar); for(int elementIndex = 0; elementIndex < elementCount; elementIndex++) { auto index = elementIndex; +#if HIPTENSOR_DATA_LAYOUT_COL_MAJOR for(int modeIndex = 0; modeIndex < modeSize; modeIndex++) { bIndices[bModeToIndex[modeA[modeIndex]]] = index % aLens[modeIndex]; @@ -76,6 +83,15 @@ namespace 
hiptensor } auto bOffset = std::inner_product(bIndices.begin(), bIndices.end(), bStrides.begin(), 0); +#else // HIPTENSOR_DATA_LAYOUT_COL_MAJOR + for(int modeIndex = modeSize - 1; modeIndex >= 0; modeIndex--) + { + bIndices[bModeToIndex[modeA[modeIndex]]] = index % aLens[modeIndex]; + index /= aLens[modeIndex]; + } + auto bOffset + = std::inner_product(bIndices.rbegin(), bIndices.rend(), bStrides.rbegin(), 0); +#endif // HIPTENSOR_DATA_LAYOUT_COL_MAJOR B[bOffset] = static_cast(A[elementIndex] * alphaValue); } diff --git a/samples/02_permutation/permutation.cpp b/samples/02_permutation/permutation.cpp index 2c95d076..ffae2d33 100644 --- a/samples/02_permutation/permutation.cpp +++ b/samples/02_permutation/permutation.cpp @@ -63,10 +63,10 @@ int main() int nmodeC = modeC.size(); std::unordered_map extent; - extent['h'] = 2; - extent['w'] = 3; - extent['c'] = 4; - extent['n'] = 5; + extent['h'] = 32; + extent['w'] = 33; + extent['c'] = 34; + extent['n'] = 35; std::vector extentA; for(auto mode : modeA) @@ -107,6 +107,7 @@ int main() hiptensorStatus_t err; hiptensorHandle_t* handle; CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); hiptensorTensorDescriptor_t descA; CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(