Support both row_major and col_major data layout
The data layout of cuTensor is col_major by default, while hipTensor
contraction currently supports only row_major. hipTensor now supports
both col_major and row_major so that users can choose which layout
they want to use.

- Use the CMake option `HIPTENSOR_DATA_LAYOUT_COL_MAJOR` to choose the layout
CongMa13 committed Nov 16, 2023
1 parent bf1c7ee commit 082a966
Showing 7 changed files with 106 additions and 27 deletions.
8 changes: 8 additions & 0 deletions CMakeLists.txt
@@ -60,6 +60,7 @@ set(CMAKE_CXX_EXTENSIONS OFF)
if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" )
option( HIPTENSOR_BUILD_TESTS "Build hiptensor tests" ON )
option( HIPTENSOR_BUILD_SAMPLES "Build hiptensor samples" ON )
option( HIPTENSOR_DATA_LAYOUT_COL_MAJOR "Set hiptensor data layout to column major" ON )
endif()

# Setup output paths
@@ -93,6 +94,13 @@ else()
endif()
message( VERBOSE "AMDGPU_TARGETS=${AMDGPU_TARGETS}")

if(HIPTENSOR_DATA_LAYOUT_COL_MAJOR)
add_compile_definitions(HIPTENSOR_DATA_LAYOUT_COL_MAJOR=1)
else()
add_compile_definitions(HIPTENSOR_DATA_LAYOUT_COL_MAJOR=0)
endif()
message("-- HIPTENSOR_DATA_LAYOUT_COL_MAJOR=${HIPTENSOR_DATA_LAYOUT_COL_MAJOR}")

# Setup HIP
find_package(hip REQUIRED )
message(STATUS "HIP version: ${hip_VERSION}")
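Note: the option defaults to ON (col_major) and can be switched at configure time with -DHIPTENSOR_DATA_LAYOUT_COL_MAJOR=OFF. Because it is forwarded through add_compile_definitions, every translation unit built as part of hipTensor sees HIPTENSOR_DATA_LAYOUT_COL_MAJOR defined to 0 or 1 and can branch on it at compile time. A minimal sketch, not part of this commit, assuming it is compiled inside the hipTensor build so the macro is defined:

// Minimal sketch: report which layout the build was configured with.
// HIPTENSOR_DATA_LAYOUT_COL_MAJOR is defined to 0 or 1 by the CMake change above.
#include <cstdio>

int main()
{
#if HIPTENSOR_DATA_LAYOUT_COL_MAJOR
    std::printf("hipTensor data layout: col_major\n");
#else
    std::printf("hipTensor data layout: row_major\n");
#endif
    return 0;
}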
2 changes: 1 addition & 1 deletion library/src/contraction/hiptensor_contraction.cpp
@@ -87,7 +87,7 @@ hiptensorStatus_t hiptensorInitContractionDescriptor(const hiptensorHandle_t*
auto& logger = Logger::instance();

// Log API access
char msg[1024];
char msg[2048];
snprintf(
msg,
sizeof(msg),
2 changes: 1 addition & 1 deletion library/src/permutation/hiptensor_permutation.cpp
@@ -26,7 +26,7 @@
#include <hiptensor/hiptensor.hpp>

#include "logger.hpp"
#include "permutation_ck_col.hpp"
#include "permutation_ck.hpp"

hiptensorStatus_t hiptensorPermutation(const hiptensorHandle_t* handle,
const void* alpha,
library/src/permutation/permutation_ck_col.hpp → library/src/permutation/permutation_ck.hpp
@@ -44,5 +44,5 @@ namespace hiptensor
}
}

#include "permutation_ck_col_impl.hpp"
#include "permutation_ck_impl.hpp"
#endif // HIPTENSOR_PERMUTATION_CK_COL_HPP
library/src/permutation/permutation_ck_col_impl.hpp → library/src/permutation/permutation_ck_impl.hpp
@@ -26,11 +26,13 @@
#ifndef HIPTENSOR_PERMUTATION_CK_COL_IMPL_HPP
#define HIPTENSOR_PERMUTATION_CK_COL_IMPL_HPP
#include <cstdlib>
#include <unordered_map>

#include <ck/ck.hpp>
#include <ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp>
#include <ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp>

#include "performance.hpp"
#include "types.hpp"

namespace hiptensor
@@ -74,35 +76,50 @@ namespace hiptensor
modeToLength[modeA[index]] = descA->mLengths[index];
}

float alphaValue = readVal<float>(alpha, typeScalar);
std::array<const void*, 1> input = {A};
std::array<void*, 1> output = {B};
std::unordered_map<int32_t, int32_t> bModeToStrides;
int32_t stride = 1;
bModeToStrides[modeB[0]] = stride;
#if HIPTENSOR_DATA_LAYOUT_COL_MAJOR
std::array<ck::index_t, 4> aStrides
= {1,
modeToLength[modeA[0]],
modeToLength[modeA[0]] * modeToLength[modeA[1]],
modeToLength[modeA[0]] * modeToLength[modeA[1]] * modeToLength[modeA[2]]};
int32_t stride = 1;
bModeToStrides[modeB[0]] = stride;
for(int32_t index = 1; index < modeSize; index++)
{
stride *= modeToLength[modeB[index - 1]];
bModeToStrides[modeB[index]] = stride;
}

float alphaValue = readVal<float>(alpha, typeScalar);
std::array<const void*, 1> input = {A};
std::array<void*, 1> output = {B};
std::array<ck::index_t, 4> a_strides
= {1,
modeToLength[modeA[0]],
modeToLength[modeA[0]] * modeToLength[modeA[1]],
modeToLength[modeA[0]] * modeToLength[modeA[1]] * modeToLength[modeA[2]]};
std::array<ck::index_t, 4> b_strides = {bModeToStrides[modeA[0]],
#else // HIPTENSOR_DATA_LAYOUT_COL_MAJOR
std::array<ck::index_t, 4> aStrides = {
modeToLength[modeA[1]] * modeToLength[modeA[2]] * modeToLength[modeA[3]],
modeToLength[modeA[2]] * modeToLength[modeA[3]],
modeToLength[modeA[3]],
1,
};
int32_t stride = 1;
bModeToStrides[modeB[modeSize - 1]] = stride;
for(int32_t index = modeSize - 2; index >= 0; index--)
{
stride *= modeToLength[modeB[index + 1]];
bModeToStrides[modeB[index]] = stride;
}
#endif // HIPTENSOR_DATA_LAYOUT_COL_MAJOR
std::array<ck::index_t, 4> bStrides = {bModeToStrides[modeA[0]],
bModeToStrides[modeA[1]],
bModeToStrides[modeA[2]],
bModeToStrides[modeA[3]]};
std::array<ck::index_t, 4> ab_lengths = {modeToLength[modeA[0]],
std::array<ck::index_t, 4> abLengths = {modeToLength[modeA[0]],
modeToLength[modeA[1]],
modeToLength[modeA[2]],
modeToLength[modeA[3]]};
auto broadcastPermute = DeviceElementwisePermuteInstance{};
auto argument = broadcastPermute.MakeArgumentPointer(ab_lengths,
{a_strides},
{b_strides},
auto argument = broadcastPermute.MakeArgumentPointer(abLengths,
{aStrides},
{bStrides},
input,
output,
PassThrough{},
@@ -115,7 +132,44 @@
};

auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{stream, false});

// Perform contraction with timing if LOG_LEVEL_PERF_TRACE
using hiptensor::Logger;
auto& logger = Logger::instance();
bool measurePermuteTime = logger->getLogMask() & HIPTENSOR_LOG_LEVEL_PERF_TRACE;

auto permuteTime = broadcastPermute_invoker_ptr->Run(
argument.get(), StreamConfig{stream, measurePermuteTime});
if(measurePermuteTime)
{
std::size_t problemSize
= std::accumulate(abLengths.begin(), abLengths.end(), 1, std::multiplies{});
std::size_t flops = std::size_t(2) * problemSize;

std::size_t bytes = 2 * sizeof(DataType) * problemSize;
float tflops = static_cast<float>(flops) / 1.E9 / permuteTime;
float bandwidth = bytes / 1.E6 / permuteTime;

hiptensor::PerfMetrics metrics = {
0, // id, permute has only one solution, set id to 0
"default solution", // name
permuteTime, // avg time
tflops, // tflops
bandwidth // BW
};

// log perf metrics (not name/id)
char msg[2048];
snprintf(msg,
sizeof(msg),
"KernelId: %lu KernelName: %s, %0.3f ms, %0.3f TFlops, %0.3f GB/s",
metrics.mKernelUid,
metrics.mKernelName.c_str(),
metrics.mAvgTimeMs,
metrics.mTflops,
metrics.mBandwidth);
logger->logPerformanceTrace("hiptensorPermutation", msg);
}
return HIPTENSOR_STATUS_SUCCESS;
}
}
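To make the two stride branches above concrete: the column-major branch gives the first mode unit stride, the row-major branch gives the last mode unit stride. A standalone sketch, not hipTensor code, that computes both stride sets for an assumed 4-D shape of {2, 3, 4, 5}:

// Illustrative only: the same stride formulas selected by the
// HIPTENSOR_DATA_LAYOUT_COL_MAJOR branches above, for lengths {2, 3, 4, 5}.
//   col-major strides: {1, 2, 6, 24}   (first mode contiguous)
//   row-major strides: {60, 20, 5, 1}  (last mode contiguous)
#include <array>
#include <cstdio>

int main()
{
    std::array<int, 4> lengths = {2, 3, 4, 5};

    std::array<int, 4> colMajor{};
    colMajor[0] = 1;
    for(int i = 1; i < 4; ++i)
    {
        colMajor[i] = colMajor[i - 1] * lengths[i - 1];
    }

    std::array<int, 4> rowMajor{};
    rowMajor[3] = 1;
    for(int i = 2; i >= 0; --i)
    {
        rowMajor[i] = rowMajor[i + 1] * lengths[i + 1];
    }

    std::printf("col-major: %d %d %d %d\n", colMajor[0], colMajor[1], colMajor[2], colMajor[3]);
    std::printf("row-major: %d %d %d %d\n", rowMajor[0], rowMajor[1], rowMajor[2], rowMajor[3]);
    return 0;
}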
22 changes: 19 additions & 3 deletions library/src/permutation/permutation_cpu_reference_impl.hpp
@@ -56,26 +56,42 @@ namespace hiptensor
bModeToIndex[modeB[index]] = index;
}

auto& aLens = descA->mLengths;
// auto bStrides = descB->mStrides; // TODO descB->mStrides contains incorrect strides
auto bStrides = std::vector<int32_t>(modeSize, 1);
auto& aLens = descA->mLengths;
auto bStrides = std::vector<int32_t>(modeSize, 1);
#if HIPTENSOR_DATA_LAYOUT_COL_MAJOR
for(int i = 1; i < modeSize; i++)
{
bStrides[i] = descB->mLengths[i - 1] * bStrides[i - 1];
}
#else // HIPTENSOR_DATA_LAYOUT_COL_MAJOR
for(int i = modeSize - 2; i >= 0; i--)
{
bStrides[i] = descB->mLengths[i + 1] * bStrides[i + 1];
}
#endif // HIPTENSOR_DATA_LAYOUT_COL_MAJOR
auto bIndices = std::vector<int32_t>(modeSize, 0);
auto elementCount = hiptensor::elementsFromLengths(aLens);
float alphaValue = readVal<float>(alpha, typeScalar);
for(int elementIndex = 0; elementIndex < elementCount; elementIndex++)
{
auto index = elementIndex;
#if HIPTENSOR_DATA_LAYOUT_COL_MAJOR
for(int modeIndex = 0; modeIndex < modeSize; modeIndex++)
{
bIndices[bModeToIndex[modeA[modeIndex]]] = index % aLens[modeIndex];
index /= aLens[modeIndex];
}
auto bOffset
= std::inner_product(bIndices.begin(), bIndices.end(), bStrides.begin(), 0);
#else // HIPTENSOR_DATA_LAYOUT_COL_MAJOR
for(int modeIndex = modeSize - 1; modeIndex >= 0; modeIndex--)
{
bIndices[bModeToIndex[modeA[modeIndex]]] = index % aLens[modeIndex];
index /= aLens[modeIndex];
}
auto bOffset
= std::inner_product(bIndices.rbegin(), bIndices.rend(), bStrides.rbegin(), 0);
#endif // HIPTENSOR_DATA_LAYOUT_COL_MAJOR
B[bOffset] = static_cast<DataType>(A[elementIndex] * alphaValue);
}

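The CPU reference follows the same idea on the host: decompose each linear index of A mode by mode (fastest-varying mode first under col_major), scatter the digits into B's mode order, then turn them back into an offset with B's strides. A self-contained sketch under assumed modes and lengths, illustrative only and not the hipTensor reference source:

// Illustrative sketch of the col-major reference permutation loop above.
// A has modes 'n','c','w','h' with lengths {5, 4, 3, 2}; B has modes 'c','n','h','w'.
#include <array>
#include <cstdint>
#include <numeric>
#include <vector>

int main()
{
    std::array<int, 4> aLens = {5, 4, 3, 2};
    std::array<int, 4> aToB  = {1, 0, 3, 2}; // position of each A mode in B
    std::array<int, 4> bLens = {4, 5, 2, 3};

    // Column-major strides of B (first mode contiguous).
    std::array<int64_t, 4> bStrides = {1, 0, 0, 0};
    for(int i = 1; i < 4; ++i)
    {
        bStrides[i] = bStrides[i - 1] * bLens[i - 1];
    }

    int64_t elementCount = 5 * 4 * 3 * 2;
    std::vector<float> A(elementCount, 1.0f), B(elementCount, 0.0f);

    for(int64_t e = 0; e < elementCount; ++e)
    {
        int64_t index = e;
        std::array<int64_t, 4> bIndices{};
        for(int m = 0; m < 4; ++m) // fastest-varying A mode first
        {
            bIndices[aToB[m]] = index % aLens[m];
            index /= aLens[m];
        }
        int64_t bOffset = std::inner_product(
            bIndices.begin(), bIndices.end(), bStrides.begin(), int64_t{0});
        B[bOffset] = A[e]; // alpha scaling omitted for brevity
    }
    return 0;
}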
9 changes: 5 additions & 4 deletions samples/02_permutation/permutation.cpp
@@ -63,10 +63,10 @@ int main()
int nmodeC = modeC.size();

std::unordered_map<int, int64_t> extent;
extent['h'] = 2;
extent['w'] = 3;
extent['c'] = 4;
extent['n'] = 5;
extent['h'] = 32;
extent['w'] = 33;
extent['c'] = 34;
extent['n'] = 35;

std::vector<int64_t> extentA;
for(auto mode : modeA)
@@ -107,6 +107,7 @@
hiptensorStatus_t err;
hiptensorHandle_t* handle;
CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle));
CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE));

hiptensorTensorDescriptor_t descA;
CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(
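The sample now also enables the performance trace before running the permutation, which is what exercises the timing path added to permutation_ck_impl.hpp above. A minimal sketch of that setup, illustrative only and assuming the CHECK_HIPTENSOR_ERROR macro defined in the sample:

// Minimal sketch: create a handle and enable the performance trace so that
// hiptensorPermutation logs kernel time, TFLOPS and bandwidth.
#include <hiptensor/hiptensor.hpp>

void setupWithPerfTrace(hiptensorHandle_t** handle)
{
    CHECK_HIPTENSOR_ERROR(hiptensorCreate(handle));
    CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE));
    // ... build descriptors and call hiptensorPermutation as in the sample.
}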
