From 074c0009ea1988fae56e3a9d2b77a93c042fae3c Mon Sep 17 00:00:00 2001
From: Cong Ma <congma13@amd.com>
Date: Tue, 31 Oct 2023 00:23:10 +0000
Subject: [PATCH 1/5] Add CPU implementation of elementwise permutation

- Add CPU implementation of elementwise permutation -  permutationByCpu
- Add sample of elementwise permutation

GPU implementation of elementwise permutation is not done yet.
---
 .../src/permutation/hiptensor_permutation.cpp |  14 +-
 .../permutation/permutation_cpu_reference.hpp |  50 +++++
 .../permutation_cpu_reference_impl.hpp        |  85 +++++++++
 .../simple_bilinear_contraction_f32.cpp       |   9 +-
 .../simple_scale_contraction_f32.cpp          |   9 +-
 samples/02_permutation/CMakeLists.txt         |  34 ++++
 samples/02_permutation/permutation.cpp        | 179 ++++++++++++++++++
 samples/CMakeLists.txt                        |   9 +-
 samples/{01_contraction => }/common.hpp       |   8 +-
 9 files changed, 380 insertions(+), 17 deletions(-)
 create mode 100644 library/src/permutation/permutation_cpu_reference.hpp
 create mode 100644 library/src/permutation/permutation_cpu_reference_impl.hpp
 create mode 100644 samples/02_permutation/CMakeLists.txt
 create mode 100644 samples/02_permutation/permutation.cpp
 rename samples/{01_contraction => }/common.hpp (93%)
diff --git a/library/src/permutation/hiptensor_permutation.cpp b/library/src/permutation/hiptensor_permutation.cpp
index ed4db184..493616a0 100644
--- a/library/src/permutation/hiptensor_permutation.cpp
+++ b/library/src/permutation/hiptensor_permutation.cpp
@@ -25,6 +25,8 @@
  *******************************************************************************/
 #include <hiptensor/hiptensor.hpp>
 
+#include "permutation_cpu_reference.hpp"
+
 hiptensorStatus_t hiptensorPermutation(const hiptensorHandle_t*           handle,
                                        const void*                        alpha,
                                        const void*                        A,
@@ -36,5 +38,15 @@ hiptensorStatus_t hiptensorPermutation(const hiptensorHandle_t*           handle
                                        const hipDataType                  typeScalar,
                                        const hipStream_t                  stream)
 {
-    return HIPTENSOR_STATUS_SUCCESS;
+    assert(descA->mType == HIP_R_16F || descA->mType == HIP_R_32F);
+    assert(descA->mType == descB->mType);
+    if(descA->mType == HIP_R_16F)
+    {
+        return hiptensor::detail::permuteByCpu(alpha, static_cast<const _Float16 *>(A), descA, modeA, static_cast<_Float16 *>(B), descB, modeB, typeScalar);
+    }
+    else if(descA->mType == HIP_R_32F)
+    {
+        return hiptensor::detail::permuteByCpu(alpha, static_cast<const float *>(A), descA, modeA, static_cast<float *>(B), descB, modeB, typeScalar);
+    }
+    return  HIPTENSOR_STATUS_NOT_SUPPORTED;
 }
diff --git a/library/src/permutation/permutation_cpu_reference.hpp b/library/src/permutation/permutation_cpu_reference.hpp
new file mode 100644
index 00000000..f7e335f2
--- /dev/null
+++ b/library/src/permutation/permutation_cpu_reference.hpp
@@ -0,0 +1,50 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef HIPTENSOR_PERMUTATION_CPU_REFERENCE_HPP
+#define HIPTENSOR_PERMUTATION_CPU_REFERENCE_HPP
+
+#include <hip/library_types.h>
+
+#include <hiptensor/hiptensor.hpp>
+namespace hiptensor
+{
+    namespace detail
+    {
+        template <typename DataType>
+        hiptensorStatus_t permuteByCpu(const void*                        alpha,
+                                       const DataType*                    A,
+                                       const hiptensorTensorDescriptor_t* descA,
+                                       const int32_t                      modeA[],
+                                       DataType*                          B,
+                                       const hiptensorTensorDescriptor_t* descB,
+                                       const int32_t                      modeB[],
+                                       const hipDataType                  typeScalar);
+    }
+}
+
+#include "permutation_cpu_reference_impl.hpp"
+#endif // HIPTENSOR_PERMUTATION_CPU_REFERENCE_HPP
diff --git a/library/src/permutation/permutation_cpu_reference_impl.hpp b/library/src/permutation/permutation_cpu_reference_impl.hpp
new file mode 100644
index 00000000..6cc34e6b
--- /dev/null
+++ b/library/src/permutation/permutation_cpu_reference_impl.hpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef HIPTENSOR_PERMUTATION_CPU_REFERENCE_IMPL_HPP
+#define HIPTENSOR_PERMUTATION_CPU_REFERENCE_IMPL_HPP
+#include <hiptensor/hiptensor.hpp>
+#include <vector>
+#include <unordered_map>
+
+#include "permutation_cpu_reference.hpp"
+#include "types.hpp"
+#include "util.hpp"
+
+namespace hiptensor
+{
+    namespace detail
+    {
+        template <typename DataType>
+        hiptensorStatus_t permuteByCpu(const void*                        alpha,
+                                       const DataType*                    A,
+                                       const hiptensorTensorDescriptor_t* descA,
+                                       const int32_t                      modeA[],
+                                       DataType*                          B,
+                                       const hiptensorTensorDescriptor_t* descB,
+                                       const int32_t                      modeB[],
+                                       const hipDataType                  typeScalar)
+        {
+            const auto modeSize = descA->mLengths.size();
+            assert(modeSize <= 4);
+
+            std::unordered_map<int32_t, int32_t> bModeToIndex;
+            for(int32_t index = 0; index < modeSize; index++)
+            {
+                bModeToIndex[modeB[index]] = index;
+            }
+
+            auto& aLens = descA->mLengths;
+            // auto bStrides = descB->mStrides; // TODO descB->mStrides contains incorrect strides
+            auto bStrides = std::vector<int32_t>(modeSize, 1);
+            for(int i = 1; i < modeSize; i++)
+            {
+                bStrides[i] = descB->mLengths[i - 1] * bStrides[i - 1];
+            }
+            auto bIndices     = std::vector<int32_t>(modeSize, 0);
+            auto elementCount = hiptensor::elementsFromLengths(aLens);
+            for(int elementIndex = 0; elementIndex < elementCount; elementIndex++)
+            {
+                auto index = elementIndex;
+                for(int modeIndex = 0; modeIndex < modeSize; modeIndex++)
+                {
+                    bIndices[bModeToIndex[modeA[modeIndex]]] = index % aLens[modeIndex];
+                    index /= aLens[modeIndex];
+                }
+                auto bOffset
+                    = std::inner_product(bIndices.begin(), bIndices.end(), bStrides.begin(), 0);
+                B[bOffset] = A[elementIndex];
+            }
+
+            return HIPTENSOR_STATUS_SUCCESS;
+        }
+    }
+}
+#endif //HIPTENSOR_PERMUTATION_CPU_REFERENCE_IMPL_HPP
diff --git a/samples/01_contraction/simple_bilinear_contraction_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f32.cpp
index 4781e72b..5704a59d 100644
--- a/samples/01_contraction/simple_bilinear_contraction_f32.cpp
+++ b/samples/01_contraction/simple_bilinear_contraction_f32.cpp
@@ -150,9 +150,12 @@ int main(int argc, char* argv[])
     size_t sizeB = sizeof(BDataType) * elementsB;
     size_t sizeC = sizeof(CDataType) * elementsC;
 
-    ADataType* A = (ADataType*)malloc(sizeA);
-    BDataType* B = (BDataType*)malloc(sizeB);
-    CDataType* C = (CDataType*)malloc(sizeC);
+    ADataType* A = nullptr;
+    BDataType* B = nullptr;
+    CDataType* C = nullptr;
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA));
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB));
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC));
 
     void *A_d, *B_d, *C_d;
 
diff --git a/samples/01_contraction/simple_scale_contraction_f32.cpp b/samples/01_contraction/simple_scale_contraction_f32.cpp
index e8c45e31..c76ec370 100644
--- a/samples/01_contraction/simple_scale_contraction_f32.cpp
+++ b/samples/01_contraction/simple_scale_contraction_f32.cpp
@@ -147,9 +147,12 @@ int main(int argc, char* argv[])
     size_t sizeB = sizeof(BDataType) * elementsB;
     size_t sizeD = sizeof(DDataType) * elementsD;
 
-    ADataType* A = (ADataType*)malloc(sizeA);
-    BDataType* B = (BDataType*)malloc(sizeB);
-    DDataType* D = (DDataType*)malloc(sizeD);
+    ADataType* A = nullptr;
+    BDataType* B = nullptr;
+    DDataType* D = nullptr;
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA));
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB));
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD));
 
     void *A_d, *B_d, *D_d;
 
diff --git a/samples/02_permutation/CMakeLists.txt b/samples/02_permutation/CMakeLists.txt
new file mode 100644
index 00000000..68857b54
--- /dev/null
+++ b/samples/02_permutation/CMakeLists.txt
@@ -0,0 +1,34 @@
+###############################################################################
+ #
+ # MIT License
+ #
+ # Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be included in
+ # all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ # THE SOFTWARE.
+ #
+ ###############################################################################
+
+# Check whether building within hiptensor context
+if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" )
+    add_hiptensor_sample(permutation permutation.cpp)
+# If building hipTensor samples as a standalone Cmake project
+else()
+    add_executable(permutation permutation.cpp)
+    target_link_libraries(permutation PRIVATE hiptensor::hiptensor)
+endif()
diff --git a/samples/02_permutation/permutation.cpp b/samples/02_permutation/permutation.cpp
new file mode 100644
index 00000000..2c95d076
--- /dev/null
+++ b/samples/02_permutation/permutation.cpp
@@ -0,0 +1,179 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <iostream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <hiptensor/hiptensor.hpp>
+#include <hiptensor/internal/hiptensor_utility.hpp>
+
+#include "common.hpp"
+
+int main()
+{
+    if(!isF32Supported())
+    {
+        std::cout << "unsupported host device" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    typedef float floatTypeA;
+    typedef float floatTypeC;
+    typedef float floatTypeCompute;
+
+    hipDataType typeA       = HIP_R_32F;
+    hipDataType typeC       = HIP_R_32F;
+    hipDataType typeCompute = HIP_R_32F;
+
+    /**********************
+      B_{w, h, c, n} = 1.0 *  \textsl{IDENTITY}(A_{c, n, h, w})
+     **********************/
+
+    std::vector<int> modeA{'w', 'h', 'c', 'n'};
+    std::vector<int> modeC{'c', 'n', 'h', 'w'};
+    int              nmodeA = modeA.size();
+    int              nmodeC = modeC.size();
+
+    std::unordered_map<int, int64_t> extent;
+    extent['h'] = 2;
+    extent['w'] = 3;
+    extent['c'] = 4;
+    extent['n'] = 5;
+
+    std::vector<int64_t> extentA;
+    for(auto mode : modeA)
+        extentA.push_back(extent[mode]);
+    std::vector<int64_t> extentC;
+    for(auto mode : modeC)
+        extentC.push_back(extent[mode]);
+
+    /**********************
+     * Allocating data
+     **********************/
+
+    size_t elementsA = 1;
+    for(auto mode : modeA)
+        elementsA *= extent[mode];
+    size_t elementsC = 1;
+    for(auto mode : modeC)
+        elementsC *= extent[mode];
+
+    size_t sizeA = sizeof(floatTypeA) * elementsA;
+    size_t sizeC = sizeof(floatTypeC) * elementsC;
+
+    void *A_d, *C_d;
+    CHECK_HIP_ERROR(hipMalloc((void**)&A_d, sizeA));
+    CHECK_HIP_ERROR(hipMalloc((void**)&C_d, sizeC));
+
+    floatTypeA *A, *C;
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeof(floatTypeA) * elementsA));
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeof(floatTypeC) * elementsC));
+
+    for(size_t i = 0; i < elementsA; i++)
+    {
+        A[i] = (float)i;
+    }
+
+    CHECK_HIP_ERROR(hipMemcpy(A_d, A, sizeA, hipMemcpyDefault));
+
+    hiptensorStatus_t  err;
+    hiptensorHandle_t* handle;
+    CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle));
+
+    hiptensorTensorDescriptor_t descA;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(
+        handle, &descA, nmodeA, extentA.data(), NULL /* stride */, typeA, HIPTENSOR_OP_IDENTITY));
+
+    hiptensorTensorDescriptor_t descC;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(
+        handle, &descC, nmodeC, extentC.data(), NULL /* stride */, typeC, HIPTENSOR_OP_IDENTITY));
+
+    const floatTypeCompute one = 1.0f;
+    CHECK_HIPTENSOR_ERROR(hiptensorPermutation(handle,
+                                               &one,
+                                               A_d,
+                                               &descA,
+                                               modeA.data(),
+                                               C_d,
+                                               &descC,
+                                               modeC.data(),
+                                               typeCompute,
+                                               0 /* stream */));
+
+#if !NDEBUG
+    bool printElements = false;
+    bool storeElements = false;
+
+    if(printElements || storeElements)
+    {
+        CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDefault));
+    }
+
+    if(printElements)
+    {
+        if(elementsA < MAX_ELEMENTS_PRINT_COUNT)
+        {
+            std::cout << "Tensor A elements:\n";
+            hiptensorPrintArrayElements(std::cout, A, elementsA);
+            std::cout << std::endl;
+        }
+
+        if(elementsC < MAX_ELEMENTS_PRINT_COUNT)
+        {
+            std::cout << "Tensor C elements:\n";
+            hiptensorPrintArrayElements(std::cout, C, elementsC);
+            std::cout << std::endl;
+        }
+    }
+
+    if(storeElements)
+    {
+        std::ofstream tensorA, tensorB, tensorC;
+        tensorA.open("tensor_A.txt");
+        hiptensorPrintElementsToFile(tensorA, A, elementsA, ", ");
+        tensorA.close();
+
+        tensorC.open("tensor_C_scale_contraction_results.txt");
+        hiptensorPrintElementsToFile(tensorC, C, elementsC, ", ");
+        tensorC.close();
+    }
+
+#endif
+
+    CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle));
+    HIPTENSOR_FREE_HOST(A);
+    HIPTENSOR_FREE_HOST(C);
+    HIPTENSOR_FREE_DEVICE(A_d);
+    HIPTENSOR_FREE_DEVICE(C_d);
+
+    std::cout << "Finished!" << std::endl;
+    return 0;
+}
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 4914859b..ab06097b 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -40,6 +40,7 @@ if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" )
         target_link_libraries(${BINARY_NAME} PRIVATE hiptensor::hiptensor "-L${HIP_CLANG_ROOT}/lib" "-Wl,-rpath=$ORIGIN/../${CMAKE_INSTALL_LIBDIR}")
         target_include_directories(${BINARY_NAME} PRIVATE
                                 ${CMAKE_CURRENT_SOURCE_DIR}
+                                ${PROJECT_SOURCE_DIR}/samples
                                 ${PROJECT_SOURCE_DIR}/library/include)
 
         # Build this sample under custom target
@@ -51,10 +52,6 @@ if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" )
         COMPONENT samples
         )
     endfunction()
-
-    add_subdirectory(01_contraction)
-
-
 # If building hipTensor samples as a standalone Cmake project
 else()
     if(NOT CMAKE_CXX_COMPILER)
@@ -81,6 +78,6 @@ else()
     find_package( hip REQUIRED )
 
     message(STATUS "Build with HIP ${hip_VERSION}")
-
-    add_subdirectory(01_contraction)
 endif()
+add_subdirectory(01_contraction)
+add_subdirectory(02_permutation)
diff --git a/samples/01_contraction/common.hpp b/samples/common.hpp
similarity index 93%
rename from samples/01_contraction/common.hpp
rename to samples/common.hpp
index a7f75542..acb614c4 100644
--- a/samples/01_contraction/common.hpp
+++ b/samples/common.hpp
@@ -35,10 +35,10 @@
         CHECK_HIP_ERROR(hipFree(ptr)); \
     }
 
-#define HIPTENSOR_FREE_HOST(ptr) \
-    if(ptr != nullptr)           \
-    {                            \
-        free(ptr);               \
+#define HIPTENSOR_FREE_HOST(ptr)           \
+    if(ptr != nullptr)                     \
+    {                                      \
+        CHECK_HIP_ERROR(hipHostFree(ptr)); \
     }
 
 inline bool isF32Supported()

From 60f4b900b82704da2c9043f12e672017f71f7ef1 Mon Sep 17 00:00:00 2001
From: Cong Ma <congma13@amd.com>
Date: Wed, 1 Nov 2023 20:24:08 +0000
Subject: [PATCH 2/5] Move files used by all tests out of 01_contraction

moved 01_contraction/device/common.hpp -> device/common.hpp
renamed 01_contraction/common.hpp -> utils.hpp since there is already a
file named common.hpp in test folder
---
 test/01_contraction/contraction_test.cpp        | 2 +-
 test/01_contraction/contraction_test_params.hpp | 2 +-
 test/{01_contraction => }/device/common.hpp     | 0
 test/hiptensor_gtest_main.cpp                   | 2 +-
 test/{01_contraction/common.hpp => utils.hpp}   | 6 +++---
 5 files changed, 6 insertions(+), 6 deletions(-)
 rename test/{01_contraction => }/device/common.hpp (100%)
 rename test/{01_contraction/common.hpp => utils.hpp} (98%)

diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp
index ca67b8c1..790676cd 100644
--- a/test/01_contraction/contraction_test.cpp
+++ b/test/01_contraction/contraction_test.cpp
@@ -28,7 +28,7 @@
 #include "../library/src/include/types.hpp"
 #include "llvm/hiptensor_options.hpp"
 
-#include "common.hpp"
+#include "../utils.hpp"
 #include "contraction_cpu_reference.hpp"
 #include "contraction_test.hpp"
 
diff --git a/test/01_contraction/contraction_test_params.hpp b/test/01_contraction/contraction_test_params.hpp
index 952219ac..30c99349 100644
--- a/test/01_contraction/contraction_test_params.hpp
+++ b/test/01_contraction/contraction_test_params.hpp
@@ -33,7 +33,7 @@
 #include <hiptensor/hiptensor.hpp>
 #include <hiptensor/hiptensor_types.hpp>
 
-#include "common.hpp"
+#include "../utils.hpp"
 
 namespace hiptensor
 {
diff --git a/test/01_contraction/device/common.hpp b/test/device/common.hpp
similarity index 100%
rename from test/01_contraction/device/common.hpp
rename to test/device/common.hpp
diff --git a/test/hiptensor_gtest_main.cpp b/test/hiptensor_gtest_main.cpp
index 2fcaa275..2d8f1d45 100644
--- a/test/hiptensor_gtest_main.cpp
+++ b/test/hiptensor_gtest_main.cpp
@@ -23,7 +23,7 @@
  * SOFTWARE.
  *
  *******************************************************************************/
-#include "01_contraction/common.hpp"
+#include "utils.hpp"
 #include "llvm/hiptensor_options.hpp"
 
 #include <gtest/gtest.h>
diff --git a/test/01_contraction/common.hpp b/test/utils.hpp
similarity index 98%
rename from test/01_contraction/common.hpp
rename to test/utils.hpp
index c31de3e6..421c1a04 100644
--- a/test/01_contraction/common.hpp
+++ b/test/utils.hpp
@@ -24,8 +24,8 @@
  *
  *******************************************************************************/
 
-#ifndef HIPTENSOR_TEST_CONTRACTION_COMMON_HPP
-#define HIPTENSOR_TEST_CONTRACTION_COMMON_HPP
+#ifndef HIPTENSOR_TEST_UTILS_HPP
+#define HIPTENSOR_TEST_UTILS_HPP
 
 #include <algorithm>
 #include <fstream>
@@ -292,4 +292,4 @@ namespace std
     }
 }
 
-#endif // HIPTENSOR_TEST_CONTRACTION_COMMON_HPP
+#endif // HIPTENSOR_TEST_UTILS_HPP

From bf1c7eed5ecfeb28f3f85dae62c1ed9d3dea7b69 Mon Sep 17 00:00:00 2001
From: Cong Ma <congma13@amd.com>
Date: Thu, 2 Nov 2023 14:24:53 +0000
Subject: [PATCH 3/5] Implement permutation with CK

- implemented `hiptensorPermutation` with CK
- added unit tests for `hiptensorPermutation`
---
 .gitignore                                    |   2 +-
 .../hiptensor/internal/hiptensor_utility.hpp  |   5 +
 library/src/hiptensor.cpp                     |   3 +-
 .../src/permutation/hiptensor_permutation.cpp | 131 +++++++-
 .../src/permutation/permutation_ck_col.hpp    |  48 +++
 .../permutation/permutation_ck_col_impl.hpp   | 123 +++++++
 .../permutation/permutation_cpu_reference.hpp |   3 -
 .../permutation_cpu_reference_impl.hpp        |   9 +-
 test/01_contraction/contraction_resource.hpp  |   4 +-
 test/01_contraction/contraction_test.cpp      |   6 +-
 .../contraction_test_params.hpp               |   2 +-
 test/02_permutation/CMakeLists.txt            |  35 ++
 test/02_permutation/configs/test_params.yaml  |  41 +++
 .../permutation_column_major_test.cpp}        |  32 +-
 test/02_permutation/permutation_resource.cpp  | 161 +++++++++
 test/02_permutation/permutation_resource.hpp  | 104 ++++++
 test/02_permutation/permutation_test.cpp      | 306 ++++++++++++++++++
 test/02_permutation/permutation_test.hpp      | 105 ++++++
 .../permutation_test_helpers.hpp              |  68 ++++
 .../permutation_test_params.hpp               |  95 ++++++
 test/CMakeLists.txt                           |   4 +-
 test/device/common.hpp                        |   2 +-
 test/llvm/CMakeLists.txt                      |   4 +-
 ...tion_config.cpp => yaml_parser_config.cpp} |  47 ++-
 test/utils.hpp                                |  45 ++-
 25 files changed, 1337 insertions(+), 48 deletions(-)
 create mode 100644 library/src/permutation/permutation_ck_col.hpp
 create mode 100644 library/src/permutation/permutation_ck_col_impl.hpp
 create mode 100644 test/02_permutation/CMakeLists.txt
 create mode 100644 test/02_permutation/configs/test_params.yaml
 rename test/{singleton.hpp => 02_permutation/permutation_column_major_test.cpp} (74%)
 create mode 100644 test/02_permutation/permutation_resource.cpp
 create mode 100644 test/02_permutation/permutation_resource.hpp
 create mode 100644 test/02_permutation/permutation_test.cpp
 create mode 100644 test/02_permutation/permutation_test.hpp
 create mode 100644 test/02_permutation/permutation_test_helpers.hpp
 create mode 100644 test/02_permutation/permutation_test_params.hpp
 rename test/llvm/{yaml_parser_contraction_config.cpp => yaml_parser_config.cpp} (85%)

diff --git a/.gitignore b/.gitignore
index ae21cc0c..ad44a303 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,7 +9,7 @@ hiptensor_version.hpp
 hiptensor-version.hpp
 
 # Generated source file
-test/01_contraction/configs/*.hpp
+test/*/configs/*.hpp
 
 # Precompiled Headers
 *.gch
diff --git a/library/include/hiptensor/internal/hiptensor_utility.hpp b/library/include/hiptensor/internal/hiptensor_utility.hpp
index f2df2dd2..5fd9e331 100644
--- a/library/include/hiptensor/internal/hiptensor_utility.hpp
+++ b/library/include/hiptensor/internal/hiptensor_utility.hpp
@@ -131,6 +131,11 @@ namespace std
 
         return os;
     }
+    static ostream& operator<<(ostream& os, const _Float16 value)
+    {
+        os << static_cast<float>(value);
+        return os;
+    }
 }
 
 #endif // HIPTENSOR_UTILITY_INTERNAL_HPP
diff --git a/library/src/hiptensor.cpp b/library/src/hiptensor.cpp
index 6638f398..3016ef64 100644
--- a/library/src/hiptensor.cpp
+++ b/library/src/hiptensor.cpp
@@ -151,7 +151,8 @@ hiptensorStatus_t hiptensorInitTensorDescriptor(const hiptensorHandle_t*     han
         return HIPTENSOR_STATUS_NOT_INITIALIZED;
     }
 
-    if((lens == nullptr) || ((dataType != HIP_R_32F) && (dataType != HIP_R_64F))
+    if((lens == nullptr)
+       || ((dataType != HIP_R_16F) && (dataType != HIP_R_32F) && (dataType != HIP_R_64F))
        || unaryOp != HIPTENSOR_OP_IDENTITY)
     {
         auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE;
diff --git a/library/src/permutation/hiptensor_permutation.cpp b/library/src/permutation/hiptensor_permutation.cpp
index 493616a0..b3d60379 100644
--- a/library/src/permutation/hiptensor_permutation.cpp
+++ b/library/src/permutation/hiptensor_permutation.cpp
@@ -25,7 +25,8 @@
  *******************************************************************************/
 #include <hiptensor/hiptensor.hpp>
 
-#include "permutation_cpu_reference.hpp"
+#include "logger.hpp"
+#include "permutation_ck_col.hpp"
 
 hiptensorStatus_t hiptensorPermutation(const hiptensorHandle_t*           handle,
                                        const void*                        alpha,
@@ -38,15 +39,133 @@ hiptensorStatus_t hiptensorPermutation(const hiptensorHandle_t*           handle
                                        const hipDataType                  typeScalar,
                                        const hipStream_t                  stream)
 {
-    assert(descA->mType == HIP_R_16F || descA->mType == HIP_R_32F);
-    assert(descA->mType == descB->mType);
+    using hiptensor::Logger;
+    auto& logger = Logger::instance();
+
+    // Log API access
+    char msg[2048];
+    snprintf(msg,
+             sizeof(msg),
+             "handle=%p, alpha=%p, A=%p, descA=%p, modeA=%p, B=%p, descB=%p, modeB=%p, "
+             "typeScalar=0x%02X, stream=%p",
+             handle,
+             alpha,
+             A,
+             descA,
+             modeA,
+             B,
+             descB,
+             modeB,
+             (unsigned int)typeScalar,
+             stream);
+
+    logger->logAPITrace("hiptensorPermutation", msg);
+
+    if(!handle || !alpha || !A || !descA || !modeA || !B || !descB || !modeB)
+    {
+        auto errorCode         = HIPTENSOR_STATUS_NOT_INITIALIZED;
+        auto printErrorMessage = [&logger, errorCode](const std::string& paramName) {
+            char msg[512];
+            snprintf(msg,
+                     sizeof(msg),
+                     "Initialization Error : %s = nullptr (%s)",
+                     paramName.c_str(),
+                     hiptensorGetErrorString(errorCode));
+            logger->logError("hiptensorPermutation", msg);
+        };
+        if(!handle)
+        {
+            printErrorMessage("handle");
+        }
+        if(!alpha)
+        {
+            printErrorMessage("alpha");
+        }
+        if(!A)
+        {
+            printErrorMessage("A");
+        }
+        if(!descA)
+        {
+            printErrorMessage("descA");
+        }
+        if(!modeA)
+        {
+            printErrorMessage("modeA");
+        }
+        if(!B)
+        {
+            printErrorMessage("B");
+        }
+        if(!descB)
+        {
+            printErrorMessage("descB");
+        }
+        if(!modeB)
+        {
+            printErrorMessage("modeB");
+        }
+        return errorCode;
+    }
+
+    if(descA->mType != HIP_R_16F && descA->mType != HIP_R_32F)
+    {
+        auto errorCode = HIPTENSOR_STATUS_NOT_SUPPORTED;
+        snprintf(msg,
+                 sizeof(msg),
+                 "Unsupported Data Type Error : The supported data types of A and B are HIP_R_16F "
+                 "and HIP_R_32F (%s)",
+                 hiptensorGetErrorString(errorCode));
+        logger->logError("hiptensorPermutation", msg);
+        return errorCode;
+    }
+
+    if(descA->mType != descB->mType)
+    {
+        auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE;
+        snprintf(msg,
+                 sizeof(msg),
+                 "Mismatched Data Type Error : Data types of A and B are not the same. (%s)",
+                 hiptensorGetErrorString(errorCode));
+        logger->logError("hiptensorPermutation", msg);
+        return errorCode;
+    }
+
+    if(typeScalar != HIP_R_16F && typeScalar != HIP_R_32F)
+    {
+        auto errorCode = HIPTENSOR_STATUS_NOT_SUPPORTED;
+        snprintf(msg,
+                 sizeof(msg),
+                 "Unsupported Data Type Error : The supported data types of alpha are HIP_R_16F "
+                 "and HIP_R_32F (%s)",
+                 hiptensorGetErrorString(errorCode));
+        logger->logError("hiptensorPermutation", msg);
+        return errorCode;
+    }
+
     if(descA->mType == HIP_R_16F)
     {
-        return hiptensor::detail::permuteByCpu(alpha, static_cast<const _Float16 *>(A), descA, modeA, static_cast<_Float16 *>(B), descB, modeB, typeScalar);
+        return hiptensor::detail::permuteByCk(alpha,
+                                              static_cast<const _Float16*>(A),
+                                              descA,
+                                              modeA,
+                                              static_cast<_Float16*>(B),
+                                              descB,
+                                              modeB,
+                                              typeScalar,
+                                              stream);
     }
     else if(descA->mType == HIP_R_32F)
     {
-        return hiptensor::detail::permuteByCpu(alpha, static_cast<const float *>(A), descA, modeA, static_cast<float *>(B), descB, modeB, typeScalar);
+        return hiptensor::detail::permuteByCk(alpha,
+                                              static_cast<const float*>(A),
+                                              descA,
+                                              modeA,
+                                              static_cast<float*>(B),
+                                              descB,
+                                              modeB,
+                                              typeScalar,
+                                              stream);
     }
-    return  HIPTENSOR_STATUS_NOT_SUPPORTED;
+    return HIPTENSOR_STATUS_NOT_SUPPORTED;
 }
diff --git a/library/src/permutation/permutation_ck_col.hpp b/library/src/permutation/permutation_ck_col.hpp
new file mode 100644
index 00000000..600658c6
--- /dev/null
+++ b/library/src/permutation/permutation_ck_col.hpp
@@ -0,0 +1,48 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef HIPTENSOR_PERMUTATION_CK_COL_HPP
+#define HIPTENSOR_PERMUTATION_CK_COL_HPP
+#include <hiptensor/hiptensor.hpp>
+
+namespace hiptensor
+{
+    namespace detail
+    {
+        template <typename DataType>
+        hiptensorStatus_t permuteByCk(const void*                        alpha,
+                                      const DataType*                    A,
+                                      const hiptensorTensorDescriptor_t* descA,
+                                      const int32_t                      modeA[],
+                                      DataType*                          B,
+                                      const hiptensorTensorDescriptor_t* descB,
+                                      const int32_t                      modeB[],
+                                      const hipDataType                  typeScalar);
+
+    }
+}
+
+#include "permutation_ck_col_impl.hpp"
+#endif // HIPTENSOR_PERMUTATION_CK_COL_HPP
diff --git a/library/src/permutation/permutation_ck_col_impl.hpp b/library/src/permutation/permutation_ck_col_impl.hpp
new file mode 100644
index 00000000..7f16bb8a
--- /dev/null
+++ b/library/src/permutation/permutation_ck_col_impl.hpp
@@ -0,0 +1,123 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef HIPTENSOR_PERMUTATION_CK_COL_IMPL_HPP
+#define HIPTENSOR_PERMUTATION_CK_COL_IMPL_HPP
+#include <cstdlib>
+
+#include <ck/ck.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp>
+#include <ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp>
+
+#include "types.hpp"
+
+namespace hiptensor
+{
+    namespace detail
+    {
+        template <typename DataType>
+        hiptensorStatus_t permuteByCk(const void*                        alpha,
+                                      const DataType*                    A,
+                                      const hiptensorTensorDescriptor_t* descA,
+                                      const int32_t                      modeA[],
+                                      DataType*                          B,
+                                      const hiptensorTensorDescriptor_t* descB,
+                                      const int32_t                      modeB[],
+                                      const hipDataType                  typeScalar,
+                                      const hipStream_t                  stream)
+        {
+            using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+            using UnaryOp     = ck::tensor_operation::element_wise::PassThrough;
+            using Scale       = ck::tensor_operation::element_wise::Scale;
+            using DeviceElementwisePermuteInstance
+                = ck::tensor_operation::device::DeviceElementwiseImpl<
+                    ck::Tuple<DataType>, // InDataTypeTuple
+                    ck::Tuple<DataType>, // OutDataTypeTuple
+                    PassThrough, // ElementwiseOp
+                    UnaryOp, // UnaryOp
+                    Scale, // Scalar
+                    4, // NumDim
+                    1, // MPerThread
+                    ck::Sequence<1>, // InScalarPerVectorSeq
+                    ck::Sequence<1>>; // OutScalarPerVectorSeq
+
+            const auto modeSize = descA->mLengths.size();
+            assert(modeSize == 4);
+
+            std::unordered_map<int32_t, int32_t>
+                modeToLength; // for example {'n': 1, 'c': 2, 'w': 3, 'h':0}
+
+            for(int32_t index = 0; index < modeSize; index++)
+            {
+                modeToLength[modeA[index]] = descA->mLengths[index];
+            }
+
+            std::unordered_map<int32_t, int32_t> bModeToStrides;
+            int32_t                              stride = 1;
+            bModeToStrides[modeB[0]]                    = stride;
+            for(int32_t index = 1; index < modeSize; index++)
+            {
+                stride *= modeToLength[modeB[index - 1]];
+                bModeToStrides[modeB[index]] = stride;
+            }
+
+            float                      alphaValue = readVal<float>(alpha, typeScalar);
+            std::array<const void*, 1> input      = {A};
+            std::array<void*, 1>       output     = {B};
+            std::array<ck::index_t, 4> a_strides
+                = {1,
+                   modeToLength[modeA[0]],
+                   modeToLength[modeA[0]] * modeToLength[modeA[1]],
+                   modeToLength[modeA[0]] * modeToLength[modeA[1]] * modeToLength[modeA[2]]};
+            std::array<ck::index_t, 4> b_strides        = {bModeToStrides[modeA[0]],
+                                                           bModeToStrides[modeA[1]],
+                                                           bModeToStrides[modeA[2]],
+                                                           bModeToStrides[modeA[3]]};
+            std::array<ck::index_t, 4> ab_lengths       = {modeToLength[modeA[0]],
+                                                           modeToLength[modeA[1]],
+                                                           modeToLength[modeA[2]],
+                                                           modeToLength[modeA[3]]};
+            auto                       broadcastPermute = DeviceElementwisePermuteInstance{};
+            auto                       argument = broadcastPermute.MakeArgumentPointer(ab_lengths,
+                                                                                       {a_strides},
+                                                                                       {b_strides},
+                                                                 input,
+                                                                 output,
+                                                                 PassThrough{},
+                                                                 UnaryOp{},
+                                                                 Scale{alphaValue});
+
+            if(!broadcastPermute.IsSupportedArgument(argument.get()))
+            {
+                return HIPTENSOR_STATUS_NOT_SUPPORTED;
+            };
+
+            auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
+            broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{stream, false});
+            return HIPTENSOR_STATUS_SUCCESS;
+        }
+    }
+}
+#endif // HIPTENSOR_PERMUTATION_CK_COL_IMPL_HPP
diff --git a/library/src/permutation/permutation_cpu_reference.hpp b/library/src/permutation/permutation_cpu_reference.hpp
index f7e335f2..0bcff3cf 100644
--- a/library/src/permutation/permutation_cpu_reference.hpp
+++ b/library/src/permutation/permutation_cpu_reference.hpp
@@ -26,9 +26,6 @@
 
 #ifndef HIPTENSOR_PERMUTATION_CPU_REFERENCE_HPP
 #define HIPTENSOR_PERMUTATION_CPU_REFERENCE_HPP
-
-#include <hip/library_types.h>
-
 #include <hiptensor/hiptensor.hpp>
 namespace hiptensor
 {
diff --git a/library/src/permutation/permutation_cpu_reference_impl.hpp b/library/src/permutation/permutation_cpu_reference_impl.hpp
index 6cc34e6b..a04e4176 100644
--- a/library/src/permutation/permutation_cpu_reference_impl.hpp
+++ b/library/src/permutation/permutation_cpu_reference_impl.hpp
@@ -26,8 +26,8 @@
 #ifndef HIPTENSOR_PERMUTATION_CPU_REFERENCE_IMPL_HPP
 #define HIPTENSOR_PERMUTATION_CPU_REFERENCE_IMPL_HPP
 #include <hiptensor/hiptensor.hpp>
-#include <vector>
 #include <unordered_map>
+#include <vector>
 
 #include "permutation_cpu_reference.hpp"
 #include "types.hpp"
@@ -63,8 +63,9 @@ namespace hiptensor
             {
                 bStrides[i] = descB->mLengths[i - 1] * bStrides[i - 1];
             }
-            auto bIndices     = std::vector<int32_t>(modeSize, 0);
-            auto elementCount = hiptensor::elementsFromLengths(aLens);
+            auto  bIndices     = std::vector<int32_t>(modeSize, 0);
+            auto  elementCount = hiptensor::elementsFromLengths(aLens);
+            float alphaValue   = readVal<float>(alpha, typeScalar);
             for(int elementIndex = 0; elementIndex < elementCount; elementIndex++)
             {
                 auto index = elementIndex;
@@ -75,7 +76,7 @@ namespace hiptensor
                 }
                 auto bOffset
                     = std::inner_product(bIndices.begin(), bIndices.end(), bStrides.begin(), 0);
-                B[bOffset] = A[elementIndex];
+                B[bOffset] = static_cast<DataType>(A[elementIndex] * alphaValue);
             }
 
             return HIPTENSOR_STATUS_SUCCESS;
diff --git a/test/01_contraction/contraction_resource.hpp b/test/01_contraction/contraction_resource.hpp
index 88f13e88..2fd3e62c 100644
--- a/test/01_contraction/contraction_resource.hpp
+++ b/test/01_contraction/contraction_resource.hpp
@@ -30,8 +30,8 @@
 #include <memory>
 #include <tuple>
 
-#include "../hip_resource.hpp"
-#include "../singleton.hpp"
+#include "hip_resource.hpp"
+#include "singleton.hpp"
 
 // ContractionResource class is intended to manage a shared pool of resources for
 // testing hiptensor contraction kernels on the GPU.
diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp
index 790676cd..4ad97610 100644
--- a/test/01_contraction/contraction_test.cpp
+++ b/test/01_contraction/contraction_test.cpp
@@ -25,12 +25,12 @@
  *******************************************************************************/
 #include <hiptensor/hiptensor.hpp>
 
-#include "../library/src/include/types.hpp"
+#include "types.hpp"
 #include "llvm/hiptensor_options.hpp"
 
-#include "../utils.hpp"
-#include "contraction_cpu_reference.hpp"
+#include "contraction/contraction_cpu_reference.hpp"
 #include "contraction_test.hpp"
+#include "utils.hpp"
 
 namespace hiptensor
 {
diff --git a/test/01_contraction/contraction_test_params.hpp b/test/01_contraction/contraction_test_params.hpp
index 30c99349..29c4aa1b 100644
--- a/test/01_contraction/contraction_test_params.hpp
+++ b/test/01_contraction/contraction_test_params.hpp
@@ -33,7 +33,7 @@
 #include <hiptensor/hiptensor.hpp>
 #include <hiptensor/hiptensor_types.hpp>
 
-#include "../utils.hpp"
+#include "utils.hpp"
 
 namespace hiptensor
 {
diff --git a/test/02_permutation/CMakeLists.txt b/test/02_permutation/CMakeLists.txt
new file mode 100644
index 00000000..4334901c
--- /dev/null
+++ b/test/02_permutation/CMakeLists.txt
@@ -0,0 +1,35 @@
+###############################################################################
+ #
+ # MIT License
+ #
+ # Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be included in
+ # all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ # THE SOFTWARE.
+ #
+ ###############################################################################
+set(PermutationCommonSources ${HIPTENSOR_COMMON_TEST_SOURCES}
+                             ${CMAKE_CURRENT_SOURCE_DIR}/permutation_resource.cpp
+                             ${CMAKE_CURRENT_SOURCE_DIR}/permutation_test.cpp)
+
+#  tests
+set (PermutationTestSources ${PermutationCommonSources}
+                                    ${CMAKE_CURRENT_SOURCE_DIR}/permutation_column_major_test.cpp)
+set (PermutationTestConfig  ${CMAKE_CURRENT_SOURCE_DIR}/configs/test_params.yaml)
+add_hiptensor_test(permutation_test ${PermutationTestConfig}  ${PermutationTestSources})
+
diff --git a/test/02_permutation/configs/test_params.yaml b/test/02_permutation/configs/test_params.yaml
new file mode 100644
index 00000000..f6aa92e2
--- /dev/null
+++ b/test/02_permutation/configs/test_params.yaml
@@ -0,0 +1,41 @@
+---
+Log Level:       [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ]
+Tensor Data Types:
+  - [ HIP_R_32F, HIP_R_32F]
+  - [ HIP_R_16F, HIP_R_16F]
+  - [ HIP_R_16F, HIP_R_32F]
+Alphas:
+  - 0.0
+  - 1.0
+  - 2.3
+Lengths:
+  - [ 1, 1, 1, 1]
+  - [ 5, 2, 3, 4]
+  - [ 5, 2, 1, 1]
+  - [ 15, 12, 23, 11]
+Permuted Dims:
+  - [0, 1, 2, 3]
+  - [0, 1, 3, 2]
+  - [0, 2, 1, 3]
+  - [0, 2, 3, 1]
+  - [0, 3, 1, 2]
+  - [0, 3, 2, 1]
+  - [1, 0, 2, 3]
+  - [1, 0, 3, 2]
+  - [1, 2, 0, 3]
+  - [1, 2, 3, 0]
+  - [1, 3, 0, 2]
+  - [1, 3, 2, 0]
+  - [2, 0, 1, 3]
+  - [2, 0, 3, 1]
+  - [2, 1, 0, 3]
+  - [2, 1, 3, 0]
+  - [2, 3, 0, 1]
+  - [2, 3, 1, 0]
+  - [3, 0, 1, 2]
+  - [3, 0, 2, 1]
+  - [3, 1, 0, 2]
+  - [3, 1, 2, 0]
+  - [3, 2, 0, 1]
+  - [3, 2, 1, 0]
+...
diff --git a/test/singleton.hpp b/test/02_permutation/permutation_column_major_test.cpp
similarity index 74%
rename from test/singleton.hpp
rename to test/02_permutation/permutation_column_major_test.cpp
index eedf37c1..9a4ace70 100644
--- a/test/singleton.hpp
+++ b/test/02_permutation/permutation_column_major_test.cpp
@@ -24,25 +24,25 @@
  *
  *******************************************************************************/
 
-#ifndef HIPTENSOR_TEST_SINGLETON_HPP
-#define HIPTENSOR_TEST_SINGLETON_HPP
+#include <hiptensor/hiptensor.hpp>
+#include <hiptensor/hiptensor_types.hpp>
 
-#include <memory>
+#include "permutation_test.hpp"
+#include "permutation_test_helpers.hpp"
 
-namespace hiptensor
+class PermutationTest : public hiptensor::PermutationTest
 {
+};
 
-    template <typename T>
-    class LazySingleton
+TEST_P(PermutationTest, RunKernel)
+{
+    static bool ranWarmup = false;
+    if(!ranWarmup)
     {
-    public:
-        static inline std::unique_ptr<T> const& instance()
-        {
-            static auto sInstance = std::make_unique<T>();
-            return sInstance;
-        }
-    };
-
-} // namespace hiptensor
+        this->Warmup();
+        ranWarmup = true;
+    }
+    this->RunKernel();
+}
 
-#endif // HIPTENSOR_TEST_SINGLETON_HPP
+INSTANTIATE_TEST_SUITE_P(PermutationTests, PermutationTest, load_config_helper());
diff --git a/test/02_permutation/permutation_resource.cpp b/test/02_permutation/permutation_resource.cpp
new file mode 100644
index 00000000..4323e01d
--- /dev/null
+++ b/test/02_permutation/permutation_resource.cpp
@@ -0,0 +1,161 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef HIPTENSOR_PERMUTATION_RESOURCE_IMPL_HPP
+#define HIPTENSOR_PERMUTATION_RESOURCE_IMPL_HPP
+
+#include "permutation_resource.hpp"
+#include "types.hpp"
+#include "utils.hpp"
+
+namespace hiptensor
+{
+
+    PermutationResource::PermutationResource()
+        : HipResource()
+        , mDeviceA(Base::allocDevice(0))
+        , mDeviceB(Base::allocDevice(0))
+        , mHostA(Base::allocHost(0))
+        , mHostB(Base::allocHost(0))
+        , mCurrentMatrixElement(0)
+        , mCurrentDataType(HIP_R_32F)
+        , mCurrentAllocByte(0)
+    {
+    }
+
+    PermutationResource::PermutationResource(PermutationResource&& rhs)
+        : HipResource()
+        , mDeviceA(std::move(rhs.mDeviceA))
+        , mDeviceB(std::move(rhs.mDeviceB))
+        , mHostA(std::move(rhs.mHostA))
+        , mHostB(std::move(rhs.mHostB))
+        , mCurrentMatrixElement(rhs.mCurrentMatrixElement)
+        , mCurrentDataType(rhs.mCurrentDataType)
+        , mCurrentAllocByte(rhs.mCurrentAllocByte)
+    {
+    }
+
+    void PermutationResource::setupStorage(ProblemDims const& dimSizes, hipDataType dataType)
+    {
+        auto requiredElementCount = getProduct(dimSizes);
+        auto requiredMemorySize   = requiredElementCount * hipDataTypeSize(dataType);
+
+        bool needFillData = false;
+        if(requiredMemorySize > mCurrentAllocByte)
+        {
+            Base::reallocDeviceHostPair(mDeviceA, mHostA, requiredMemorySize);
+            Base::reallocDeviceHostPair(mDeviceB, mHostB, requiredMemorySize);
+            Base::reallocDeviceHostPair(mDeviceReference, mHostReference, requiredMemorySize);
+            mCurrentAllocByte = requiredMemorySize;
+            needFillData      = true;
+        }
+        else if(mCurrentDataType != dataType)
+        {
+            needFillData = true;
+        }
+        mCurrentMatrixElement = requiredElementCount;
+        mCurrentDataType      = dataType;
+        if(needFillData)
+        {
+            fillRandToA();
+        }
+    }
+
+    void PermutationResource::reset()
+    {
+        Base::reallocDeviceHostPair(mDeviceA, mHostA, 0);
+        Base::reallocDeviceHostPair(mDeviceB, mHostB, 0);
+        Base::reallocDeviceHostPair(mDeviceReference, mHostReference, 0);
+        mCurrentMatrixElement = 0;
+        mCurrentDataType      = HIP_R_32F;
+        mCurrentAllocByte     = 0;
+    }
+
+    void PermutationResource::fillRandToA()
+    {
+        if(mCurrentDataType == HIP_R_32F)
+        {
+            fillLaunchKernel<float>((float*)deviceA().get(), mCurrentMatrixElement);
+        }
+        else
+        {
+            fillLaunchKernel<_Float16>((_Float16*)deviceA().get(), mCurrentMatrixElement);
+        }
+        Base::copyData(hostA(), deviceA(), getCurrentMatrixMemorySize());
+    }
+
+    void PermutationResource::copyBToHost()
+    {
+        Base::copyData(hostB(), deviceB(), getCurrentMatrixMemorySize());
+    }
+
+    void PermutationResource::copyReferenceToDevice()
+    {
+        Base::copyData(deviceReference(), hostReference(), getCurrentMatrixMemorySize());
+    }
+
+    size_t PermutationResource::getCurrentMatrixElement() const
+    {
+        return mCurrentMatrixElement;
+    }
+
+    size_t PermutationResource::getCurrentMatrixMemorySize() const
+    {
+        return mCurrentMatrixElement * hipDataTypeSize(mCurrentDataType);
+    }
+
+    auto PermutationResource::hostA() -> HostPtrT&
+    {
+        return mHostA;
+    }
+
+    auto PermutationResource::hostB() -> HostPtrT&
+    {
+        return mHostB;
+    }
+
+    auto PermutationResource::hostReference() -> HostPtrT&
+    {
+        return mHostReference;
+    }
+
+    auto PermutationResource::deviceA() -> DevicePtrT&
+    {
+        return mDeviceA;
+    }
+
+    auto PermutationResource::deviceB() -> DevicePtrT&
+    {
+        return mDeviceB;
+    }
+
+    auto PermutationResource::deviceReference() -> DevicePtrT&
+    {
+        return mDeviceReference;
+    }
+} // namespace hiptensor
+
+#endif // HIPTENSOR_PERMUTATION_RESOURCE_IMPL_HPP
diff --git a/test/02_permutation/permutation_resource.hpp b/test/02_permutation/permutation_resource.hpp
new file mode 100644
index 00000000..2e0d77ef
--- /dev/null
+++ b/test/02_permutation/permutation_resource.hpp
@@ -0,0 +1,104 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef HIPTENSOR_PERMUTATION_RESOURCE_HPP
+#define HIPTENSOR_PERMUTATION_RESOURCE_HPP
+
+#include <memory>
+#include <tuple>
+
+#include "hip_resource.hpp"
+#include "singleton.hpp"
+
+// PermutationResource class is intended to manage a shared pool of resources for
+// testing hiptensor contraction kernels on the GPU.
+//
+// It minimizes the memory handling overhead for launching thousands of GPU
+// kernels by allowing re-use of existing memory allocations. Memory is only
+// re-allocated as necessary to satisfy minimum size requirements.
+//
+// The interface indicates memory ownership by this class and shall only be
+// used to access for read/write purposes.
+//
+// Currently uses HIP as the backend for device allocation.
+
+namespace hiptensor
+{
+
+    struct PermutationResource : public HipResource, public LazySingleton<PermutationResource>
+    {
+        // For static initialization
+        friend std::unique_ptr<PermutationResource> std::make_unique<PermutationResource>();
+
+        using Base = HipResource;
+
+    public:
+        using DevicePtrT = Base::DevicePtrT;
+        using HostPtrT   = Base::HostPtrT;
+
+        // N, C, W, H
+        using ProblemDims = std::vector<std::size_t>;
+
+    private: // No public instantiation except make_unique.
+             // No copy
+        PermutationResource();
+        PermutationResource(const PermutationResource&)            = delete;
+        PermutationResource& operator=(const PermutationResource&) = delete;
+
+    public:
+        PermutationResource(PermutationResource&&);
+        virtual ~PermutationResource() = default;
+
+        void setupStorage(ProblemDims const& dimSizes, hipDataType dataType);
+        void fillRandToA();
+        void copyBToHost();
+        void copyReferenceToDevice();
+
+        HostPtrT& hostA();
+        HostPtrT& hostB();
+        HostPtrT& hostReference();
+
+        DevicePtrT& deviceA();
+        DevicePtrT& deviceB();
+        DevicePtrT& deviceReference();
+
+        size_t getCurrentMatrixElement() const;
+        size_t getCurrentMatrixMemorySize() const;
+        void   reset() final;
+
+    protected:
+        DevicePtrT mDeviceA, mDeviceB, mDeviceReference;
+        HostPtrT   mHostA, mHostB, mHostReference;
+
+        size_t mCurrentMatrixElement; /**< Element count of A/B */
+        hipDataType
+            mCurrentDataType; /**< Type size of element of A/B, only support HIP_R_16F, HIP_R_32F */
+        size_t mCurrentAllocByte; /**< Allocated size of memory */
+    };
+
+} // namespace hiptensor
+
+#endif // HIPTENSOR_PERMUTATION_RESOURCE_HPP
diff --git a/test/02_permutation/permutation_test.cpp b/test/02_permutation/permutation_test.cpp
new file mode 100644
index 00000000..dbb52d6e
--- /dev/null
+++ b/test/02_permutation/permutation_test.cpp
@@ -0,0 +1,306 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include <hiptensor/hiptensor.hpp>
+
+#include "logger.hpp"
+#include "permutation/permutation_cpu_reference.hpp"
+#include "permutation_test.hpp"
+#include "types.hpp"
+#include "utils.hpp"
+#include "llvm/hiptensor_options.hpp"
+
+namespace hiptensor
+{
+    /*static*/ std::stringstream PermutationTest::sAPILogBuff = std::stringstream();
+
+    static void logMessage(int32_t logLevel, const char* funcName /*=""*/, const char* msg /*=""*/)
+    {
+        PermutationTest::sAPILogBuff << msg;
+    }
+
+    PermutationTest::PermutationTest()
+        : Base()
+    {
+        reset();
+
+        // Handle our own outputs
+        hiptensorLoggerOpenFile("/dev/null");
+        hiptensorLoggerSetCallback(logMessage);
+    }
+
+    // Kernel run checks. Virtual as different Permutation kernels have different requirements
+    // True = run test
+    // False = skip test
+    bool PermutationTest::checkDevice(hipDataType datatype) const
+    {
+        return (isF32Supported() && datatype == HIP_R_32F)
+               || (isF64Supported() && datatype == HIP_R_64F);
+    }
+
+    bool PermutationTest::checkSizes() const
+    {
+        return true;
+    }
+
+    void PermutationTest::reset()
+    {
+        handle = nullptr;
+
+        mRepeats          = 1u;
+        mRunFlag          = true;
+        mValidationResult = false;
+        mMaxRelativeError = 0.0;
+    }
+
+    PermutationResource* PermutationTest::getResource() const
+    {
+        return DataStorage::instance().get();
+    }
+
+    void PermutationTest::SetUp()
+    {
+        // reset API log buffer
+        sAPILogBuff.str(std::string());
+
+        auto param        = Base::GetParam();
+        auto testType     = std::get<0>(param);
+        auto logLevel     = std::get<1>(param);
+        auto lengths      = std::get<2>(param);
+        auto permutedDims = std::get<3>(param);
+        auto alpha        = std::get<4>(param);
+
+        // 4D tensors only at the moment.
+        EXPECT_EQ(lengths.size(), 4); // Format {'n', 'c', 'w', 'h'}
+        EXPECT_EQ(permutedDims.size(), 4); // permutation of {0, 1, 2, 3}
+
+        EXPECT_EQ(testType.size(), 2); // HIP_R_16F or HIP_R_32F
+        auto abDataType = testType[0];
+        EXPECT_TRUE((abDataType == HIP_R_16F) || (abDataType == HIP_R_32F));
+
+        getResource()->setupStorage(lengths, abDataType);
+
+        // set mPrintElements to true to print element
+        mPrintElements = false;
+    }
+
+    void PermutationTest::reportResults(std::ostream& stream,
+                                        hipDataType   dataType,
+                                        bool          omitSkipped,
+                                        bool          omitFailed,
+                                        bool          omitPassed) const
+    {
+        // Conditionally print outputs
+        if((mRunFlag || !omitSkipped) && (mValidationResult || !omitFailed)
+           && (!mValidationResult || !omitPassed))
+        {
+            stream << PermutationTest::sAPILogBuff.str();
+
+            if(mPrintElements)
+            {
+                auto resource = getResource();
+
+                size_t elementsA = resource->getCurrentMatrixElement();
+                size_t elementsB = elementsA;
+
+                if(dataType == HIP_R_32F)
+                {
+                    stream << "Tensor A elements (" << elementsA << "):\n";
+                    hiptensorPrintArrayElements<float>(
+                        stream, (float*)resource->hostA().get(), elementsA);
+                    stream << std::endl;
+
+                    stream << "Tensor B elements (" << elementsB << "):\n";
+                    hiptensorPrintArrayElements<float>(
+                        stream, (float*)resource->hostB().get(), elementsB);
+                    stream << std::endl;
+                }
+                else
+                {
+                    stream << "Tensor A elements (" << elementsA << "):\n";
+                    hiptensorPrintArrayElements<_Float16>(
+                        stream, (_Float16*)resource->hostA().get(), elementsA);
+                    stream << std::endl;
+
+                    stream << "Tensor B elements (" << elementsB << "):\n";
+                    hiptensorPrintArrayElements<_Float16>(
+                        stream, (_Float16*)resource->hostB().get(), elementsB);
+                    stream << std::endl;
+                }
+            }
+        }
+    }
+
+    void PermutationTest::RunKernel()
+    {
+        auto param        = Base::GetParam();
+        auto testType     = std::get<0>(param);
+        auto logLevel     = std::get<1>(param);
+        auto lengths      = std::get<2>(param);
+        auto permutedDims = std::get<3>(param);
+        auto alpha        = std::get<4>(param);
+
+        auto abDataType      = testType[0];
+        auto computeDataType = testType[1];
+
+        if(!mRunFlag)
+        {
+            GTEST_SKIP();
+        }
+        auto resource = getResource();
+
+        if(mRunFlag)
+        {
+            /**********************
+              B_{w, h, c, n} = 1.0 *  \textsl{IDENTITY}(A_{c, n, h, w})
+             **********************/
+
+            std::vector<int> modeA{'n', 'c', 'w', 'h'};
+            std::vector<int> modeB;
+            for(auto dim : permutedDims)
+            {
+                modeB.push_back(modeA[dim]);
+            }
+
+            int                              nmodeA = modeA.size();
+            int                              nmodeB = modeB.size();
+            std::unordered_map<int, int64_t> extent;
+            for(auto [modeIt, i] = std::tuple{modeA.begin(), 0}; modeIt != modeA.end();
+                ++modeIt, ++i)
+            {
+                extent[*modeIt] = lengths[i];
+            }
+
+            std::vector<int64_t> extentA;
+            for(auto mode : modeA)
+                extentA.push_back(extent[mode]);
+            std::vector<int64_t> extentB;
+            for(auto mode : modeB)
+                extentB.push_back(extent[mode]);
+
+            hiptensorStatus_t  err;
+            hiptensorHandle_t* handle;
+            CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle));
+
+            hiptensorTensorDescriptor_t descA;
+            CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle,
+                                                                &descA,
+                                                                nmodeA,
+                                                                extentA.data(),
+                                                                NULL /* stride */,
+                                                                abDataType,
+                                                                HIPTENSOR_OP_IDENTITY));
+
+            hiptensorTensorDescriptor_t descB;
+            CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle,
+                                                                &descB,
+                                                                nmodeB,
+                                                                extentB.data(),
+                                                                NULL /* stride */,
+                                                                abDataType,
+                                                                HIPTENSOR_OP_IDENTITY));
+
+            float alphaValue{};
+            if(computeDataType == HIP_R_16F)
+            {
+                *(reinterpret_cast<_Float16*>(&alphaValue)) = static_cast<_Float16>(alpha);
+            }
+            else
+            {
+                *(reinterpret_cast<float*>(&alphaValue)) = static_cast<float>(alpha);
+            }
+            CHECK_HIPTENSOR_ERROR(hiptensorPermutation(handle,
+                                                       &alphaValue,
+                                                       resource->deviceA().get(),
+                                                       &descA,
+                                                       modeA.data(),
+                                                       resource->deviceB().get(),
+                                                       &descB,
+                                                       modeB.data(),
+                                                       computeDataType,
+                                                       0 /* stream */));
+            resource->copyBToHost();
+
+            if(abDataType == HIP_R_32F)
+            {
+                hiptensor::detail::permuteByCpu(&alphaValue,
+                                                (const float*)resource->hostA().get(),
+                                                &descA,
+                                                modeA.data(),
+                                                (float*)resource->hostReference().get(),
+                                                &descB,
+                                                modeB.data(),
+                                                computeDataType);
+                resource->copyReferenceToDevice();
+                std::tie(mValidationResult, mMaxRelativeError)
+                    = compareEqualLaunchKernel<float>((float*)resource->deviceB().get(),
+                                                      (float*)resource->deviceReference().get(),
+                                                      resource->getCurrentMatrixElement());
+            }
+            else if(abDataType == HIP_R_16F)
+            {
+                hiptensor::detail::permuteByCpu(&alphaValue,
+                                                (const _Float16*)resource->hostA().get(),
+                                                &descA,
+                                                modeA.data(),
+                                                (_Float16*)resource->hostReference().get(),
+                                                &descB,
+                                                modeB.data(),
+                                                computeDataType);
+                resource->copyReferenceToDevice();
+                std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<_Float16>(
+                    (_Float16*)resource->deviceB().get(),
+                    (_Float16*)resource->deviceReference().get(),
+                    resource->getCurrentMatrixElement());
+            }
+        }
+
+        EXPECT_TRUE(mValidationResult) << "Max relative error: " << mMaxRelativeError;
+
+        using Options        = hiptensor::HiptensorOptions;
+        auto& loggingOptions = Options::instance();
+
+        if(!loggingOptions->omitCout())
+        {
+            reportResults(std::cout,
+                          abDataType,
+                          loggingOptions->omitSkipped(),
+                          loggingOptions->omitFailed(),
+                          loggingOptions->omitPassed());
+        }
+
+        if(loggingOptions->ostream().isOpen())
+        {
+            reportResults(loggingOptions->ostream().fstream(),
+                          abDataType,
+                          loggingOptions->omitSkipped(),
+                          loggingOptions->omitFailed(),
+                          loggingOptions->omitPassed());
+        }
+    }
+
+    void PermutationTest::TearDown() {}
+
+} // namespace hiptensor
diff --git a/test/02_permutation/permutation_test.hpp b/test/02_permutation/permutation_test.hpp
new file mode 100644
index 00000000..9009d8ea
--- /dev/null
+++ b/test/02_permutation/permutation_test.hpp
@@ -0,0 +1,105 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef HIPTENSOR_PERMUTATION_TEST_HPP
+#define HIPTENSOR_PERMUTATION_TEST_HPP
+
+#include <sstream>
+
+#include <hiptensor/hiptensor_types.hpp>
+
+#include "permutation_resource.hpp"
+#include "permutation_test_params.hpp"
+
+#include <gtest/gtest.h>
+
+namespace hiptensor
+{
+    static void logMessage(int32_t logLevel, const char* funcName = "", const char* msg = "");
+
+    using PermutationTestParams_t = std::tuple<typename PermutationTestParams::TestTypesT,
+                                               typename PermutationTestParams::LogLevelT,
+                                               typename PermutationTestParams::LengthsT,
+                                               typename PermutationTestParams::PermutedDimsT,
+                                               typename PermutationTestParams::AlphaT>;
+    class PermutationTest : public ::testing::TestWithParam<PermutationTestParams_t>
+    {
+    protected: // Types
+        using Base = ::testing::TestWithParam<PermutationTestParams_t>;
+
+        // Shared access to Permutation storage
+        using DataStorage = PermutationResource;
+
+        friend void logMessage(int32_t, const char*, const char*);
+
+    public:
+        PermutationTest();
+        virtual ~PermutationTest() = default;
+
+    protected: // Functions
+        PermutationTest(PermutationTest&&)            = delete;
+        PermutationTest(PermutationTest const&)       = delete;
+        PermutationTest& operator=(PermutationTest&)  = delete;
+        PermutationTest& operator=(PermutationTest&&) = delete;
+
+        bool checkDevice(hipDataType datatype) const;
+        bool checkSizes() const;
+        void reset();
+
+        PermutationResource* getResource() const;
+
+        void SetUp() final;
+        void TearDown() final;
+
+        void Warmup() {}
+        void RunKernel();
+
+        void reportResults(std::ostream& stream,
+                           hipDataType   DDataType,
+                           bool          omitSkipped,
+                           bool          omitFailed,
+                           bool          omitPassed) const;
+
+    protected:
+        // Workspace items
+        hiptensorHandle_t* handle = nullptr;
+
+        hiptensorTensorDescriptor_t a_ms_ks, b_ns_ks, c_ms_ns, d_ms_ns;
+
+        // Execution flow control
+        uint32_t mRepeats;
+        bool     mRunFlag          = true;
+        bool     mValidationResult = false;
+        bool     mPrintElements    = false;
+        double   mMaxRelativeError;
+
+        // Output buffer
+        static std::stringstream sAPILogBuff;
+    };
+
+} // namespace hiptensor
+
+#endif // HIPTENSOR_PERMUTATION_TEST_HPP
diff --git a/test/02_permutation/permutation_test_helpers.hpp b/test/02_permutation/permutation_test_helpers.hpp
new file mode 100644
index 00000000..ac34a21a
--- /dev/null
+++ b/test/02_permutation/permutation_test_helpers.hpp
@@ -0,0 +1,68 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef HIPTENSOR_PERMUTATION_TEST_HELPERS_HPP
+#define HIPTENSOR_PERMUTATION_TEST_HELPERS_HPP
+
+#include <gtest/gtest.h>
+
+#include "llvm/hiptensor_options.hpp"
+#include "llvm/yaml_parser.hpp"
+
+#ifdef HIPTENSOR_TEST_YAML_INCLUDE
+#include HIPTENSOR_TEST_YAML_INCLUDE
+#define HIPTENSOR_TEST_YAML_BUNDLE 1
+#else
+#define HIPTENSOR_TEST_YAML_BUNDLE 0
+#endif // HIPTENSOR_TEST_YAML_INCLUDE
+
+auto inline load_config_helper()
+{
+    hiptensor::PermutationTestParams testParams;
+    using Options     = hiptensor::HiptensorOptions;
+    auto& testOptions = Options::instance();
+
+    if(testOptions->usingDefaultConfig() && HIPTENSOR_TEST_YAML_BUNDLE)
+    {
+        testParams = hiptensor::YamlConfigLoader<hiptensor::PermutationTestParams>::loadFromString(
+            HIPTENSOR_TEST_GET_YAML);
+    }
+    else
+    {
+        testParams = hiptensor::YamlConfigLoader<hiptensor::PermutationTestParams>::loadFromFile(
+            testOptions->inputFilename());
+    }
+
+    // testParams.printParams();
+
+    return ::testing::Combine(::testing::ValuesIn(testParams.dataTypes()),
+                              ::testing::Values(testParams.logLevelMask()),
+                              ::testing::ValuesIn(testParams.problemLengths()),
+                              ::testing::ValuesIn(testParams.permutedDims()),
+                              ::testing::ValuesIn(testParams.alphas()));
+}
+
+#endif // HIPTENSOR_PERMUTATION_TEST_HELPERS_HPP
diff --git a/test/02_permutation/permutation_test_params.hpp b/test/02_permutation/permutation_test_params.hpp
new file mode 100644
index 00000000..f526ad47
--- /dev/null
+++ b/test/02_permutation/permutation_test_params.hpp
@@ -0,0 +1,95 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef HIPTENSOR_PERMUTATION_TEST_PARAMS_HPP
+#define HIPTENSOR_PERMUTATION_TEST_PARAMS_HPP
+
+#include <tuple>
+#include <vector>
+
+#include <hiptensor/hiptensor.hpp>
+#include <hiptensor/hiptensor_types.hpp>
+
+#include "utils.hpp"
+
+namespace hiptensor
+{
+
+    struct PermutationTestParams
+    {
+        using TestTypesT    = std::vector<hipDataType>;
+        using LogLevelT     = hiptensorLogLevel_t;
+        using LengthsT      = std::vector<std::size_t>;
+        using AlphaT        = double;
+        using PermutedDimsT = std::vector<std::size_t>;
+
+    public:
+        std::vector<TestTypesT>& dataTypes()
+        {
+            return mDataTypes;
+        }
+
+        LogLevelT& logLevelMask()
+        {
+            return mLogLevelMask;
+        }
+
+        std::vector<LengthsT>& problemLengths()
+        {
+            return mProblemLengths;
+        }
+
+        std::vector<PermutedDimsT>& permutedDims()
+        {
+            return mPermutedDims;
+        }
+
+        std::vector<AlphaT>& alphas()
+        {
+            return mAlphas;
+        }
+
+        void printParams()
+        {
+            std::cout << "DataTypes: " << mDataTypes << "\n"
+                      << "LogLevelMask: " << mLogLevelMask << "\n"
+                      << "ProblemLengths: " << mProblemLengths << "\n"
+                      << "Alphas: " << mAlphas << "\n"
+                      << "PermutedDims: " << mPermutedDims << "\n";
+        }
+
+    private:
+        //Data types of input and output tensors
+        std::vector<TestTypesT>    mDataTypes;
+        LogLevelT                  mLogLevelMask;
+        std::vector<LengthsT>      mProblemLengths;
+        std::vector<AlphaT>        mAlphas;
+        std::vector<PermutedDimsT> mPermutedDims;
+    };
+
+} // namespace hiptensor
+
+#endif // HIPTENSOR_PERMUTATION_TEST_PARAMS_HPP
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index e9f791cd..06967a09 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -101,7 +101,8 @@ function(add_hiptensor_test BINARY_NAME YAML_CONFIG_FILE TEST_SOURCES)
     target_include_directories(${BINARY_NAME} PRIVATE
                                ${CMAKE_CURRENT_SOURCE_DIR}
                                ${PROJECT_SOURCE_DIR}/library/include
-                               ${PROJECT_SOURCE_DIR}/library/src/contraction
+                               ${PROJECT_SOURCE_DIR}/library/src/include
+                               ${PROJECT_SOURCE_DIR}/library/src
                                ${PROJECT_SOURCE_DIR}/test
                                ${hiptensor_test_YAML_DIR})
 
@@ -179,6 +180,7 @@ endfunction()
 
 add_subdirectory(00_unit)
 add_subdirectory(01_contraction)
+add_subdirectory(02_permutation)
 
 rocm_install(
     FILES "${INSTALL_TEST_FILE}"
diff --git a/test/device/common.hpp b/test/device/common.hpp
index 8ad75dd4..f961abc1 100644
--- a/test/device/common.hpp
+++ b/test/device/common.hpp
@@ -66,7 +66,7 @@ __global__ static void
 
 // fill kernel for 'elementSize' elements
 template <typename DataType>
-__global__ void fillKernel(DataType* data, uint32_t elementSize)
+__global__ void fillKernel(DataType* data, uint32_t elementSize, uint32_t seed)
 {
     uint32_t index = (blockIdx.x * blockDim.x + threadIdx.x);
 
diff --git a/test/llvm/CMakeLists.txt b/test/llvm/CMakeLists.txt
index 064746bb..6708ae2e 100644
--- a/test/llvm/CMakeLists.txt
+++ b/test/llvm/CMakeLists.txt
@@ -56,11 +56,13 @@ set(HIPTENSOR_LLVM_LIBS "-L${LLVM_LIBRARY_DIR}"
 # Includes
 set(HIPTENSOR_LLVM_INCLUDES ${LLVM_INCLUDE_DIRS}
                             ${PROJECT_SOURCE_DIR}/library/include
+                            ${PROJECT_SOURCE_DIR}/library/src/include
+                            ${PROJECT_SOURCE_DIR}/library/src
                             ${PROJECT_SOURCE_DIR}/test
                             )
 
 # Sources for this static object
-set(HIPTENSOR_LLVM_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/yaml_parser_contraction_config.cpp
+set(HIPTENSOR_LLVM_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/yaml_parser_config.cpp
                            ${CMAKE_CURRENT_SOURCE_DIR}/hiptensor_options.cpp)
 
 # Create hiptensor_llvm object target
diff --git a/test/llvm/yaml_parser_contraction_config.cpp b/test/llvm/yaml_parser_config.cpp
similarity index 85%
rename from test/llvm/yaml_parser_contraction_config.cpp
rename to test/llvm/yaml_parser_config.cpp
index 6982eeea..46f4c43e 100644
--- a/test/llvm/yaml_parser_contraction_config.cpp
+++ b/test/llvm/yaml_parser_config.cpp
@@ -31,6 +31,7 @@
 #include <hiptensor/hiptensor.hpp>
 
 #include "01_contraction/contraction_test_params.hpp"
+#include "02_permutation/permutation_test_params.hpp"
 #include "yaml_parser_impl.hpp"
 
 // Fwd declare NoneType
@@ -108,6 +109,7 @@ namespace llvm
         {
             static void enumeration(IO& io, hipDataType& value)
             {
+                io.enumCase(value, "HIP_R_16F", HIP_R_16F);
                 io.enumCase(value, "HIP_R_32F", HIP_R_32F);
                 io.enumCase(value, "HIP_R_64F", HIP_R_64F);
                 io.enumCase(value, "NONE_TYPE", hiptensor::NONE_TYPE);
@@ -209,7 +211,7 @@ namespace llvm
         };
 
         ///
-        // Mapping of the test param elements for reading / writing.
+        // Mapping of the test param elements of ContractionTestParams for reading / writing.
         ///
         template <>
         struct MappingTraits<hiptensor::ContractionTestParams>
@@ -269,12 +271,53 @@ namespace llvm
             }
         };
 
+        ///
+        // Mapping of the test param elements of PermutationTestParams for reading / writing.
+        ///
+        template <>
+        struct MappingTraits<hiptensor::PermutationTestParams>
+        {
+            static void mapping(IO& io, hiptensor::PermutationTestParams& doc)
+            {
+                // Logging bitfield
+                io.mapRequired("Log Level", doc.logLevelMask());
+
+                // Sequences of combinatorial fields
+                io.mapRequired("Tensor Data Types", doc.dataTypes());
+                io.mapRequired("Alphas", (std::vector<AlphaT>&)(doc.alphas()));
+                io.mapRequired("Lengths", doc.problemLengths());
+                io.mapRequired("Permuted Dims", doc.permutedDims());
+            }
+
+            // Additional validation for input / output of the config
+            static std::string validate(IO& io, hiptensor::PermutationTestParams& doc)
+            {
+                if(doc.problemLengths().size() == 0)
+                {
+                    return "Error: Empty Lengths";
+                }
+
+                if(doc.alphas().size() == 0)
+                {
+                    return "Error: Empty Alphas";
+                }
+
+                if(doc.permutedDims().size() == 0)
+                {
+                    return "Error: Empty Permuted Dims";
+                }
+
+                return std::string{};
+            }
+        };
+
     } // namespace yaml
 
 } // namespace llvm
 
-// Instantiate the yaml loader for the ContractionTestParams
+// Instantiate the yaml loader for the ContractionTestParams and PermutationTestParams
 namespace hiptensor
 {
     template struct YamlConfigLoader<ContractionTestParams>;
+    template struct YamlConfigLoader<PermutationTestParams>;
 }
diff --git a/test/utils.hpp b/test/utils.hpp
index 421c1a04..67418568 100644
--- a/test/utils.hpp
+++ b/test/utils.hpp
@@ -28,6 +28,7 @@
 #define HIPTENSOR_TEST_UTILS_HPP
 
 #include <algorithm>
+#include <ctime>
 #include <fstream>
 #include <iterator>
 #include <math.h>
@@ -49,10 +50,10 @@
         CHECK_HIP_ERROR(hipFree(ptr)); \
     }
 
-#define HIPTENSOR_FREE_HOST(ptr) \
-    if(ptr != nullptr)           \
-    {                            \
-        free(ptr);               \
+#define HIPTENSOR_FREE_HOST(ptr)           \
+    if(ptr != nullptr)                     \
+    {                                      \
+        CHECK_HIP_ERROR(hipHostFree(ptr)); \
     }
 
 inline bool isF32Supported()
@@ -97,13 +98,30 @@ static constexpr intT1 ceilDiv(const intT1 numerator, const intT2 divisor)
     return (numerator + divisor - 1) / divisor;
 }
 
+template <typename Container>
+auto getProduct(const Container&               container,
+                typename Container::value_type init = typename Container::value_type{1})
+{
+    return std::accumulate(std::begin(container),
+                           std::end(container),
+                           init,
+                           std::multiplies<typename Container::value_type>{});
+}
+
 // fill kernel for 'elementSize' elements
 template <typename DataType>
 __host__ static inline void fillLaunchKernel(DataType* data, uint32_t elementSize)
 {
     auto blockDim = dim3(1024, 1, 1);
     auto gridDim  = dim3(ceilDiv(elementSize, blockDim.x), 1, 1);
-    hipLaunchKernelGGL((fillKernel<DataType>), gridDim, blockDim, 0, 0, data, elementSize);
+    hipLaunchKernelGGL((fillKernel<DataType>),
+                       gridDim,
+                       blockDim,
+                       0,
+                       0,
+                       data,
+                       elementSize,
+                       static_cast<uint32_t>(std::time(nullptr)));
 }
 
 // fill kernel wrapper for 'elementSize' elements with a specific value
@@ -191,6 +209,19 @@ std::pair<bool, double> compareEqual(DDataType const* deviceD,
     return std::make_pair(retval, max_relative_error);
 }
 
+template <typename DDataType>
+double getEpsilon()
+{
+    if(std::is_same_v<DDataType, _Float16>)
+    {
+        return 0.0009765625; // numeric_limits<_Float16>::epsilon() => 0
+    }
+    else
+    {
+        return std::numeric_limits<DDataType>::epsilon();
+    }
+};
+
 template <typename DDataType>
 std::pair<bool, double> compareEqualLaunchKernel(DDataType*  deviceD,
                                                  DDataType*  hostD,
@@ -257,7 +288,7 @@ std::pair<bool, double> compareEqualLaunchKernel(DDataType*  deviceD,
     auto toDouble
         = [](DDataType const& val) { return static_cast<double>(static_cast<float>(val)); };
 
-    auto eps = toDouble(std::numeric_limits<DDataType>::epsilon());
+    auto eps = getEpsilon<DDataType>();
     if(isNaN)
     {
         retval           = false;
@@ -276,6 +307,7 @@ namespace std
     template <typename T>
     ostream& operator<<(ostream& os, const std::vector<T>& vec)
     {
+        os << "[ ";
         for(auto i = 0; i < vec.size(); i++)
         {
             if(i < vec.size() - 1)
@@ -287,6 +319,7 @@ namespace std
                 os << vec[i];
             }
         }
+        os << " ]";
 
         return os;
     }

From 082a9662a46135fb083ce2028a55d792a9852fdf Mon Sep 17 00:00:00 2001
From: Cong Ma <congma13@amd.com>
Date: Wed, 15 Nov 2023 19:02:38 +0000
Subject: [PATCH 4/5] Support both row_major and col_major data layout

Data layout of cuTenor is col_major by default. HipTensor contraction
only supports row_major for now. HipTensor supports both col_major
and row_major to make users be able to choose which layout they want to
use.

- Use CMake option `HIPTENSOR_DATA_LAYOUT_COL_MAJOR` to choose layout
---
 CMakeLists.txt                                |  8 ++
 .../src/contraction/hiptensor_contraction.cpp |  2 +-
 .../src/permutation/hiptensor_permutation.cpp |  2 +-
 ...mutation_ck_col.hpp => permutation_ck.hpp} |  2 +-
 ...k_col_impl.hpp => permutation_ck_impl.hpp} | 88 +++++++++++++++----
 .../permutation_cpu_reference_impl.hpp        | 22 ++++-
 samples/02_permutation/permutation.cpp        |  9 +-
 7 files changed, 106 insertions(+), 27 deletions(-)
 rename library/src/permutation/{permutation_ck_col.hpp => permutation_ck.hpp} (98%)
 rename library/src/permutation/{permutation_ck_col_impl.hpp => permutation_ck_impl.hpp} (64%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c88bd54f..6fd11fb8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,6 +60,7 @@ set(CMAKE_CXX_EXTENSIONS OFF)
 if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" )
   option( HIPTENSOR_BUILD_TESTS "Build hiptensor tests" ON )
   option( HIPTENSOR_BUILD_SAMPLES "Build hiptensor samples" ON )
+  option( HIPTENSOR_DATA_LAYOUT_COL_MAJOR "Set hiptensor data layout to column major" ON )
 endif()
 
 # Setup output paths
@@ -93,6 +94,13 @@ else()
 endif()
 message( VERBOSE "AMDGPU_TARGETS=${AMDGPU_TARGETS}")
 
+if(HIPTENSOR_DATA_LAYOUT_COL_MAJOR)
+  add_compile_definitions(HIPTENSOR_DATA_LAYOUT_COL_MAJOR=1)
+else()
+  add_compile_definitions(HIPTENSOR_DATA_LAYOUT_COL_MAJOR=0)
+endif()
+message("-- HIPTENSOR_DATA_LAYOUT_COL_MAJOR=${HIPTENSOR_DATA_LAYOUT_COL_MAJOR}")
+
 # Setup HIP
 find_package(hip REQUIRED )
 message(STATUS "HIP version: ${hip_VERSION}")
diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp
index 078689ea..09f5ddf6 100644
--- a/library/src/contraction/hiptensor_contraction.cpp
+++ b/library/src/contraction/hiptensor_contraction.cpp
@@ -87,7 +87,7 @@ hiptensorStatus_t hiptensorInitContractionDescriptor(const hiptensorHandle_t*
     auto& logger = Logger::instance();
 
     // Log API access
-    char msg[1024];
+    char msg[2048];
     snprintf(
         msg,
         sizeof(msg),
diff --git a/library/src/permutation/hiptensor_permutation.cpp b/library/src/permutation/hiptensor_permutation.cpp
index b3d60379..2b092655 100644
--- a/library/src/permutation/hiptensor_permutation.cpp
+++ b/library/src/permutation/hiptensor_permutation.cpp
@@ -26,7 +26,7 @@
 #include <hiptensor/hiptensor.hpp>
 
 #include "logger.hpp"
-#include "permutation_ck_col.hpp"
+#include "permutation_ck.hpp"
 
 hiptensorStatus_t hiptensorPermutation(const hiptensorHandle_t*           handle,
                                        const void*                        alpha,
diff --git a/library/src/permutation/permutation_ck_col.hpp b/library/src/permutation/permutation_ck.hpp
similarity index 98%
rename from library/src/permutation/permutation_ck_col.hpp
rename to library/src/permutation/permutation_ck.hpp
index 600658c6..8fa4959c 100644
--- a/library/src/permutation/permutation_ck_col.hpp
+++ b/library/src/permutation/permutation_ck.hpp
@@ -44,5 +44,5 @@ namespace hiptensor
     }
 }
 
-#include "permutation_ck_col_impl.hpp"
+#include "permutation_ck_impl.hpp"
 #endif // HIPTENSOR_PERMUTATION_CK_COL_HPP
diff --git a/library/src/permutation/permutation_ck_col_impl.hpp b/library/src/permutation/permutation_ck_impl.hpp
similarity index 64%
rename from library/src/permutation/permutation_ck_col_impl.hpp
rename to library/src/permutation/permutation_ck_impl.hpp
index 7f16bb8a..7878e908 100644
--- a/library/src/permutation/permutation_ck_col_impl.hpp
+++ b/library/src/permutation/permutation_ck_impl.hpp
@@ -26,11 +26,13 @@
 #ifndef HIPTENSOR_PERMUTATION_CK_COL_IMPL_HPP
 #define HIPTENSOR_PERMUTATION_CK_COL_IMPL_HPP
 #include <cstdlib>
+#include <unordered_map>
 
 #include <ck/ck.hpp>
 #include <ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp>
 #include <ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp>
 
+#include "performance.hpp"
 #include "types.hpp"
 
 namespace hiptensor
@@ -74,35 +76,50 @@ namespace hiptensor
                 modeToLength[modeA[index]] = descA->mLengths[index];
             }
 
+            float                                alphaValue = readVal<float>(alpha, typeScalar);
+            std::array<const void*, 1>           input      = {A};
+            std::array<void*, 1>                 output     = {B};
             std::unordered_map<int32_t, int32_t> bModeToStrides;
-            int32_t                              stride = 1;
-            bModeToStrides[modeB[0]]                    = stride;
+#if HIPTENSOR_DATA_LAYOUT_COL_MAJOR
+            std::array<ck::index_t, 4> aStrides
+                = {1,
+                   modeToLength[modeA[0]],
+                   modeToLength[modeA[0]] * modeToLength[modeA[1]],
+                   modeToLength[modeA[0]] * modeToLength[modeA[1]] * modeToLength[modeA[2]]};
+            int32_t stride           = 1;
+            bModeToStrides[modeB[0]] = stride;
             for(int32_t index = 1; index < modeSize; index++)
             {
                 stride *= modeToLength[modeB[index - 1]];
                 bModeToStrides[modeB[index]] = stride;
             }
-
-            float                      alphaValue = readVal<float>(alpha, typeScalar);
-            std::array<const void*, 1> input      = {A};
-            std::array<void*, 1>       output     = {B};
-            std::array<ck::index_t, 4> a_strides
-                = {1,
-                   modeToLength[modeA[0]],
-                   modeToLength[modeA[0]] * modeToLength[modeA[1]],
-                   modeToLength[modeA[0]] * modeToLength[modeA[1]] * modeToLength[modeA[2]]};
-            std::array<ck::index_t, 4> b_strides        = {bModeToStrides[modeA[0]],
+#else // HIPTENSOR_DATA_LAYOUT_COL_MAJOR
+            std::array<ck::index_t, 4> aStrides = {
+                modeToLength[modeA[1]] * modeToLength[modeA[2]] * modeToLength[modeA[3]],
+                modeToLength[modeA[2]] * modeToLength[modeA[3]],
+                modeToLength[modeA[3]],
+                1,
+            };
+            int32_t stride                      = 1;
+            bModeToStrides[modeB[modeSize - 1]] = stride;
+            for(int32_t index = modeSize - 2; index >= 0; index--)
+            {
+                stride *= modeToLength[modeB[index + 1]];
+                bModeToStrides[modeB[index]] = stride;
+            }
+#endif // HIPTENSOR_DATA_LAYOUT_COL_MAJOR
+            std::array<ck::index_t, 4> bStrides         = {bModeToStrides[modeA[0]],
                                                            bModeToStrides[modeA[1]],
                                                            bModeToStrides[modeA[2]],
                                                            bModeToStrides[modeA[3]]};
-            std::array<ck::index_t, 4> ab_lengths       = {modeToLength[modeA[0]],
+            std::array<ck::index_t, 4> abLengths        = {modeToLength[modeA[0]],
                                                            modeToLength[modeA[1]],
                                                            modeToLength[modeA[2]],
                                                            modeToLength[modeA[3]]};
             auto                       broadcastPermute = DeviceElementwisePermuteInstance{};
-            auto                       argument = broadcastPermute.MakeArgumentPointer(ab_lengths,
-                                                                                       {a_strides},
-                                                                                       {b_strides},
+            auto                       argument = broadcastPermute.MakeArgumentPointer(abLengths,
+                                                                                       {aStrides},
+                                                                                       {bStrides},
                                                                  input,
                                                                  output,
                                                                  PassThrough{},
@@ -115,7 +132,44 @@ namespace hiptensor
             };
 
             auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
-            broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{stream, false});
+
+            // Perform contraction with timing if LOG_LEVEL_PERF_TRACE
+            using hiptensor::Logger;
+            auto& logger             = Logger::instance();
+            bool  measurePermuteTime = logger->getLogMask() & HIPTENSOR_LOG_LEVEL_PERF_TRACE;
+
+            auto permuteTime = broadcastPermute_invoker_ptr->Run(
+                argument.get(), StreamConfig{stream, measurePermuteTime});
+            if(measurePermuteTime)
+            {
+                std::size_t problemSize
+                    = std::accumulate(abLengths.begin(), abLengths.end(), 1, std::multiplies{});
+                std::size_t flops = std::size_t(2) * problemSize;
+
+                std::size_t bytes     = 2 * sizeof(DataType) * problemSize;
+                float       tflops    = static_cast<float>(flops) / 1.E9 / permuteTime;
+                float       bandwidth = bytes / 1.E6 / permuteTime;
+
+                hiptensor::PerfMetrics metrics = {
+                    0, // id, permute has only one solution, set id to 0
+                    "default solution", // name
+                    permuteTime, // avg time
+                    tflops, // tflops
+                    bandwidth // BW
+                };
+
+                // log perf metrics (not name/id)
+                char msg[2048];
+                snprintf(msg,
+                         sizeof(msg),
+                         "KernelId: %lu KernelName: %s, %0.3f ms, %0.3f TFlops, %0.3f GB/s",
+                         metrics.mKernelUid,
+                         metrics.mKernelName.c_str(),
+                         metrics.mAvgTimeMs,
+                         metrics.mTflops,
+                         metrics.mBandwidth);
+                logger->logPerformanceTrace("hiptensorPermutation", msg);
+            }
             return HIPTENSOR_STATUS_SUCCESS;
         }
     }
diff --git a/library/src/permutation/permutation_cpu_reference_impl.hpp b/library/src/permutation/permutation_cpu_reference_impl.hpp
index a04e4176..d64147fe 100644
--- a/library/src/permutation/permutation_cpu_reference_impl.hpp
+++ b/library/src/permutation/permutation_cpu_reference_impl.hpp
@@ -56,19 +56,26 @@ namespace hiptensor
                 bModeToIndex[modeB[index]] = index;
             }
 
-            auto& aLens = descA->mLengths;
-            // auto bStrides = descB->mStrides; // TODO descB->mStrides contains incorrect strides
-            auto bStrides = std::vector<int32_t>(modeSize, 1);
+            auto& aLens    = descA->mLengths;
+            auto  bStrides = std::vector<int32_t>(modeSize, 1);
+#if HIPTENSOR_DATA_LAYOUT_COL_MAJOR
             for(int i = 1; i < modeSize; i++)
             {
                 bStrides[i] = descB->mLengths[i - 1] * bStrides[i - 1];
             }
+#else // HIPTENSOR_DATA_LAYOUT_COL_MAJOR
+            for(int i = modeSize - 2; i >= 0; i--)
+            {
+                bStrides[i] = descB->mLengths[i + 1] * bStrides[i + 1];
+            }
+#endif // HIPTENSOR_DATA_LAYOUT_COL_MAJOR
             auto  bIndices     = std::vector<int32_t>(modeSize, 0);
             auto  elementCount = hiptensor::elementsFromLengths(aLens);
             float alphaValue   = readVal<float>(alpha, typeScalar);
             for(int elementIndex = 0; elementIndex < elementCount; elementIndex++)
             {
                 auto index = elementIndex;
+#if HIPTENSOR_DATA_LAYOUT_COL_MAJOR
                 for(int modeIndex = 0; modeIndex < modeSize; modeIndex++)
                 {
                     bIndices[bModeToIndex[modeA[modeIndex]]] = index % aLens[modeIndex];
@@ -76,6 +83,15 @@ namespace hiptensor
                 }
                 auto bOffset
                     = std::inner_product(bIndices.begin(), bIndices.end(), bStrides.begin(), 0);
+#else // HIPTENSOR_DATA_LAYOUT_COL_MAJOR
+                for(int modeIndex = modeSize - 1; modeIndex >= 0; modeIndex--)
+                {
+                    bIndices[bModeToIndex[modeA[modeIndex]]] = index % aLens[modeIndex];
+                    index /= aLens[modeIndex];
+                }
+                auto bOffset
+                    = std::inner_product(bIndices.rbegin(), bIndices.rend(), bStrides.rbegin(), 0);
+#endif // HIPTENSOR_DATA_LAYOUT_COL_MAJOR
                 B[bOffset] = static_cast<DataType>(A[elementIndex] * alphaValue);
             }
 
diff --git a/samples/02_permutation/permutation.cpp b/samples/02_permutation/permutation.cpp
index 2c95d076..ffae2d33 100644
--- a/samples/02_permutation/permutation.cpp
+++ b/samples/02_permutation/permutation.cpp
@@ -63,10 +63,10 @@ int main()
     int              nmodeC = modeC.size();
 
     std::unordered_map<int, int64_t> extent;
-    extent['h'] = 2;
-    extent['w'] = 3;
-    extent['c'] = 4;
-    extent['n'] = 5;
+    extent['h'] = 32;
+    extent['w'] = 33;
+    extent['c'] = 34;
+    extent['n'] = 35;
 
     std::vector<int64_t> extentA;
     for(auto mode : modeA)
@@ -107,6 +107,7 @@ int main()
     hiptensorStatus_t  err;
     hiptensorHandle_t* handle;
     CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle));
+    CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE));
 
     hiptensorTensorDescriptor_t descA;
     CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(

From 53d863aff10647a30e1a89632036d255f5a582bb Mon Sep 17 00:00:00 2001
From: Cong Ma <congma13@amd.com>
Date: Wed, 15 Nov 2023 22:31:58 +0000
Subject: [PATCH 5/5] Add type related files

new file:   include/native_types.hpp
new file:   include/native_types_impl.hpp
new file:   include/type_traits.hpp
new file:   include/types_ext.hpp
new file:   include/xfloat32.hpp
---
 .../hiptensor/internal/hiptensor_utility.hpp  |   5 -
 library/src/CMakeLists.txt                    |   2 +-
 .../contraction/contraction_meta_traits.hpp   |   2 +-
 .../contraction_solution_params.hpp           |   2 +-
 .../contraction_solution_params_impl.hpp      |   2 +-
 .../contraction_solution_registry.hpp         |   2 +-
 library/src/{types.cpp => data_types.cpp}     |   2 +-
 library/src/hiptensor.cpp                     |   2 +-
 library/src/include/config.hpp                | 104 +++++
 library/src/include/data_types.hpp            |  78 ++++
 .../{types_impl.hpp => data_types_impl.hpp}   |   8 +-
 library/src/include/native_types.hpp          | 102 +++++
 library/src/include/native_types_impl.hpp     |  36 ++
 library/src/include/type_traits.hpp           | 390 ++++++++++++++++++
 library/src/include/types.hpp                 |  60 +--
 library/src/include/types_ext.hpp             | 181 ++++++++
 library/src/include/xfloat32.hpp              | 334 +++++++++++++++
 .../src/permutation/permutation_ck_impl.hpp   |   2 +-
 .../permutation_cpu_reference_impl.hpp        |   2 +-
 test/01_contraction/contraction_test.cpp      |   2 +-
 test/02_permutation/permutation_resource.cpp  |   2 +-
 test/02_permutation/permutation_test.cpp      |   2 +-
 test/utils.hpp                                |  16 +-
 23 files changed, 1248 insertions(+), 90 deletions(-)
 rename library/src/{types.cpp => data_types.cpp} (99%)
 create mode 100644 library/src/include/config.hpp
 create mode 100644 library/src/include/data_types.hpp
 rename library/src/include/{types_impl.hpp => data_types_impl.hpp} (97%)
 create mode 100644 library/src/include/native_types.hpp
 create mode 100644 library/src/include/native_types_impl.hpp
 create mode 100644 library/src/include/type_traits.hpp
 create mode 100644 library/src/include/types_ext.hpp
 create mode 100644 library/src/include/xfloat32.hpp

diff --git a/library/include/hiptensor/internal/hiptensor_utility.hpp b/library/include/hiptensor/internal/hiptensor_utility.hpp
index 5fd9e331..f2df2dd2 100644
--- a/library/include/hiptensor/internal/hiptensor_utility.hpp
+++ b/library/include/hiptensor/internal/hiptensor_utility.hpp
@@ -131,11 +131,6 @@ namespace std
 
         return os;
     }
-    static ostream& operator<<(ostream& os, const _Float16 value)
-    {
-        os << static_cast<float>(value);
-        return os;
-    }
 }
 
 #endif // HIPTENSOR_UTILITY_INTERNAL_HPP
diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt
index 801d0761..286ed2e3 100644
--- a/library/src/CMakeLists.txt
+++ b/library/src/CMakeLists.txt
@@ -62,7 +62,7 @@ set(HIPTENSOR_CORE_SOURCES
    ${CMAKE_CURRENT_SOURCE_DIR}/hiptensor.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/logger.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/performance.cpp
-   ${CMAKE_CURRENT_SOURCE_DIR}/types.cpp
+   ${CMAKE_CURRENT_SOURCE_DIR}/data_types.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/hip_device.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/handle.cpp
 )
diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp
index e018661b..4fa7acf7 100644
--- a/library/src/contraction/contraction_meta_traits.hpp
+++ b/library/src/contraction/contraction_meta_traits.hpp
@@ -34,8 +34,8 @@
 #include <element_wise_operation.hpp>
 
 // hiptensor includes
+#include "data_types.hpp"
 #include "meta_traits.hpp"
-#include "types.hpp"
 
 namespace hiptensor
 {
diff --git a/library/src/contraction/contraction_solution_params.hpp b/library/src/contraction/contraction_solution_params.hpp
index f55d3c85..ec9de45c 100644
--- a/library/src/contraction/contraction_solution_params.hpp
+++ b/library/src/contraction/contraction_solution_params.hpp
@@ -30,7 +30,7 @@
 #include <hiptensor/hiptensor_types.hpp>
 
 #include "contraction_types.hpp"
-#include "types.hpp"
+#include "data_types.hpp"
 
 namespace hiptensor
 {
diff --git a/library/src/contraction/contraction_solution_params_impl.hpp b/library/src/contraction/contraction_solution_params_impl.hpp
index 194fb0d5..bff33960 100644
--- a/library/src/contraction/contraction_solution_params_impl.hpp
+++ b/library/src/contraction/contraction_solution_params_impl.hpp
@@ -29,8 +29,8 @@
 
 #include "contraction_meta_traits.hpp"
 #include "contraction_solution_params.hpp"
+#include "data_types.hpp"
 #include "hash.hpp"
-#include "types.hpp"
 
 namespace std
 {
diff --git a/library/src/contraction/contraction_solution_registry.hpp b/library/src/contraction/contraction_solution_registry.hpp
index 039e9e14..d1b80ec5 100644
--- a/library/src/contraction/contraction_solution_registry.hpp
+++ b/library/src/contraction/contraction_solution_registry.hpp
@@ -32,8 +32,8 @@
 #include <vector>
 
 #include "contraction_types.hpp"
+#include "data_types.hpp"
 #include "singleton.hpp"
-#include "types.hpp"
 
 namespace hiptensor
 {
diff --git a/library/src/types.cpp b/library/src/data_types.cpp
similarity index 99%
rename from library/src/types.cpp
rename to library/src/data_types.cpp
index 9cfb3290..b270973d 100644
--- a/library/src/types.cpp
+++ b/library/src/data_types.cpp
@@ -24,7 +24,7 @@
  *
  *******************************************************************************/
 
-#include "types.hpp"
+#include "data_types.hpp"
 
 namespace hiptensor
 {
diff --git a/library/src/hiptensor.cpp b/library/src/hiptensor.cpp
index 3016ef64..9740d2a8 100644
--- a/library/src/hiptensor.cpp
+++ b/library/src/hiptensor.cpp
@@ -27,9 +27,9 @@
 
 #include <hiptensor/hiptensor.hpp>
 
+#include "data_types.hpp"
 #include "handle.hpp"
 #include "logger.hpp"
-#include "types.hpp"
 #include "util.hpp"
 
 hiptensorStatus_t hiptensorCreate(hiptensorHandle_t** handle)
diff --git a/library/src/include/config.hpp b/library/src/include/config.hpp
new file mode 100644
index 00000000..3348968e
--- /dev/null
+++ b/library/src/include/config.hpp
@@ -0,0 +1,104 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef HIPTENSOR_CONFIG_HPP
+#define HIPTENSOR_CONFIG_HPP
+
+namespace hiptensor
+{
+
+///
+/// Architecture support
+/// Guaranteed symbols:
+/// HIPTENSOR_ARCH_GFX908
+/// HIPTENSOR_ARCH_GFX90a
+/// HIPTENSOR_ARCH_GFX940
+/// HIPTENSOR_ARCH_GFX941
+/// HIPTENSOR_ARCH_GFX942
+#if defined(__gfx908__)
+#define HIPTENSOR_ARCH_GFX908 __gfx908__
+#elif defined(__gfx90a__)
+#define HIPTENSOR_ARCH_GFX90A __gfx90a__
+#elif defined(__gfx940__)
+#define HIPTENSOR_ARCH_GFX940 __gfx940__
+#elif defined(__gfx941__)
+#define HIPTENSOR_ARCH_GFX941 __gfx941__
+#elif defined(__gfx942__)
+#define HIPTENSOR_ARCH_GFX942 __gfx942__
+#else
+#define HIPTENSOR_ARCH_HOST 1
+#endif
+
+#if !defined(HIPTENSOR_ARCH_GFX908)
+#define HIPTENSOR_ARCH_GFX908 0
+#endif
+#if !defined(HIPTENSOR_ARCH_GFX90A)
+#define HIPTENSOR_ARCH_GFX90A 0
+#endif
+#if !defined(HIPTENSOR_ARCH_GFX940)
+#define HIPTENSOR_ARCH_GFX940 0
+#endif
+#if !defined(HIPTENSOR_ARCH_GFX941)
+#define HIPTENSOR_ARCH_GFX941 0
+#endif
+#if !defined(HIPTENSOR_ARCH_GFX942)
+#define HIPTENSOR_ARCH_GFX942 0
+#endif
+#if !defined(HIPTENSOR_ARCH_HOST)
+#define HIPTENSOR_ARCH_HOST 0
+#endif
+
+#if defined(NDEBUG)
+#define HIPTENSOR_UNSUPPORTED_IMPL(MSG)
+#else
+#define HIPTENSOR_UNSUPPORTED_IMPL(MSG) __attribute__((deprecated(MSG)))
+#endif
+
+#if defined(HIP_NO_HALF)
+#define HIPTENSOR_NO_HALF 1
+#else
+#define HIPTENSOR_NO_HALF 0
+#endif // HIP_NO_HALF
+
+#if HIPTENSOR_NO_HALF || (!HIPTENSOR_NO_HALF && defined(__HIP_NO_HALF_CONVERSIONS__))
+#define HIPTENSOR_TESTS_NO_HALF 1
+#else
+#define HIPTENSOR_TESTS_NO_HALF 0
+#endif // !HIPTENSOR_NO_HALF && defined(__HIP_NO_HALF_CONVERSIONS__)
+
+///
+/// Host and Device symbols
+///
+#define HIPTENSOR_DEVICE __device__
+
+#define HIPTENSOR_HOST __host__
+
+#define HIPTENSOR_HOST_DEVICE HIPTENSOR_HOST HIPTENSOR_DEVICE
+
+#define HIPTENSOR_KERNEL __global__
+
+} // namespace hiptensor
+
+#endif // HIPTENSOR_CONFIG_HPP
diff --git a/library/src/include/data_types.hpp b/library/src/include/data_types.hpp
new file mode 100644
index 00000000..42197650
--- /dev/null
+++ b/library/src/include/data_types.hpp
@@ -0,0 +1,78 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef HIPTENSOR_LIBRARY_DATA_TYPES_HPP
+#define HIPTENSOR_LIBRARY_DATA_TYPES_HPP
+
+// clang-format off
+// Include order needs to be preserved
+#include <hip/library_types.h>
+#include <hip/hip_bfloat16.h>
+#include <hip/hip_fp16.h>
+#include <iostream>
+
+#include <hiptensor/hiptensor_types.hpp>
+
+// clang-format on
+
+namespace hiptensor
+{
+    // Used to map to empty tensors
+    struct NoneType;
+
+    static constexpr hipDataType NONE_TYPE = (hipDataType)31;
+
+    // Map type to runtime HipDataType
+    template <typename T>
+    struct HipDataType;
+
+    template <typename T>
+    static constexpr auto HipDataType_v = HipDataType<T>::value;
+
+    // Get data size in bytes from id
+    uint32_t hipDataTypeSize(hipDataType id);
+
+    // Convert hipDataType to hiptensorComputeType_t
+    hiptensorComputeType_t convertToComputeType(hipDataType hipType);
+
+    // Read a single value from void pointer, casted to T
+    template <typename T>
+    T readVal(void const* value, hipDataType id);
+
+    template <typename T>
+    T readVal(void const* value, hiptensorComputeType_t id);
+
+} // namespace hiptensor
+
+bool operator==(hipDataType hipType, hiptensorComputeType_t computeType);
+bool operator==(hiptensorComputeType_t computeType, hipDataType hipType);
+
+bool operator!=(hipDataType hipType, hiptensorComputeType_t computeType);
+bool operator!=(hiptensorComputeType_t computeType, hipDataType hipType);
+
+#include "data_types_impl.hpp"
+
+#endif // HIPTENSOR_LIBRARY_DATA_TYPES_HPP
diff --git a/library/src/include/types_impl.hpp b/library/src/include/data_types_impl.hpp
similarity index 97%
rename from library/src/include/types_impl.hpp
rename to library/src/include/data_types_impl.hpp
index 7a07d0b0..7df6d7d9 100644
--- a/library/src/include/types_impl.hpp
+++ b/library/src/include/data_types_impl.hpp
@@ -24,10 +24,10 @@
  *
  *******************************************************************************/
 
-#ifndef HIPTENSOR_LIBRARY_TYPES_IMPL_HPP
-#define HIPTENSOR_LIBRARY_TYPES_IMPL_HPP
+#ifndef HIPTENSOR_LIBRARY_DATA_TYPES_IMPL_HPP
+#define HIPTENSOR_LIBRARY_DATA_TYPES_IMPL_HPP
 
-#include "types.hpp"
+#include "data_types.hpp"
 
 namespace hiptensor
 {
@@ -217,4 +217,4 @@ namespace hiptensor
 
 } // namespace hiptensor
 
-#endif // HIPTENSOR_LIBRARY_TYPES_IMPL_HPP
+#endif // HIPTENSOR_LIBRARY_DATA_TYPES_IMPL_HPP
diff --git a/library/src/include/native_types.hpp b/library/src/include/native_types.hpp
new file mode 100644
index 00000000..6c9dbee8
--- /dev/null
+++ b/library/src/include/native_types.hpp
@@ -0,0 +1,102 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef HIPTENSOR_NATIVE_TYPES_HPP
+#define HIPTENSOR_NATIVE_TYPES_HPP
+
+#include <array>
+#include <hip/hip_bfloat16.h>
+#include <hip/hip_fp16.h>
+#include <hip/hip_vector_types.h>
+#include <type_traits>
+#include <utility>
+
+#include "xfloat32.hpp"
+
+namespace hiptensor
+{
+
+    /**
+ * \defgroup DataTypes Data Type Metadata
+ *
+ * @brief Definition and metadata on supported data types of matrices.
+ *
+ * @{
+ *
+ * Native Data Types:
+ * float64_t = f64 = double
+ * float = f32
+ * _Float16 = f16
+ * int8
+ * uint8
+ * int16
+ * int32
+ * uint32
+ *
+ *
+ * Non-Native Data Types:
+ * h16 = __half
+ * bf16 = bfloat16
+ *
+ */
+
+    // Native types
+    using float16_t = _Float16;
+    using float32_t = float;
+    using float64_t = double;
+
+    using int8_t    = ::int8_t;
+    using uint8_t   = ::uint8_t;
+    using int16_t   = ::int16_t;
+    using uint16_t  = ::uint16_t;
+    using int32_t   = ::int32_t;
+    using uint32_t  = ::uint32_t;
+    using int64_t   = ::int64_t;
+    using uint64_t  = ::uint64_t;
+    using index_t   = ::int32_t;
+    using index64_t = ::int64_t;
+
+    // Non-native types
+    using bfloat16_t = hip_bfloat16;
+
+#if !HIPTENSOR_NO_HALF
+    using hfloat16_t = __half;
+#endif // !HIPTENSOR_NO_HALF
+
+    using xfloat32_t = hiptensor_xfloat32;
+
+    // clang-format off
+
+
+} // namespace hiptensor
+
+// Add in some extensions to basic type support.
+// Some of these are required for vector implementations.
+// #include "type_traits.hpp"
+// #include "types_ext.hpp"
+
+#include "native_types_impl.hpp"
+
+#endif // HIPTENSOR_NATIVE_TYPES_HPP
diff --git a/library/src/include/native_types_impl.hpp b/library/src/include/native_types_impl.hpp
new file mode 100644
index 00000000..1b29b459
--- /dev/null
+++ b/library/src/include/native_types_impl.hpp
@@ -0,0 +1,36 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef HIPTENSOR_NATIVE_TYPES_IMPL_HPP
+#define HIPTENSOR_NATIVE_TYPES_IMPL_HPP
+
+#include "native_types.hpp"
+
+namespace hiptensor
+{
+
+} // namespace hiptensor
+
+#endif // HIPTENSOR_NATIVE_TYPES_IMPL_HPP
diff --git a/library/src/include/type_traits.hpp b/library/src/include/type_traits.hpp
new file mode 100644
index 00000000..3867839d
--- /dev/null
+++ b/library/src/include/type_traits.hpp
@@ -0,0 +1,390 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef HIPTENSOR_TYPE_TRAITS_HPP
+#define HIPTENSOR_TYPE_TRAITS_HPP
+#include "native_types.hpp"
+#include <cfloat>
+
+namespace hiptensor
+{
+    namespace detail
+    {
+        struct Fp16Bits
+        {
+            union
+            {
+                uint16_t  i16;
+                float16_t f16;
+#if !HIPTENSOR_NO_HALF
+                hfloat16_t h16;
+#endif // !HIPTENSOR_NO_HALF
+                bfloat16_t b16;
+            };
+            constexpr Fp16Bits(uint16_t initVal)
+                : i16(initVal)
+            {
+            }
+#define TEST_TEST 1
+            constexpr Fp16Bits(float16_t initVal)
+                : f16(initVal)
+            {
+            }
+#if !HIPTENSOR_NO_HALF
+            constexpr Fp16Bits(hfloat16_t initVal)
+                : h16(initVal)
+            {
+            }
+#endif
+            constexpr Fp16Bits(bfloat16_t initVal)
+                : b16(initVal)
+            {
+            }
+        };
+
+        struct Fp32Bits
+        {
+            union
+            {
+                uint32_t   i32;
+                float32_t  f32;
+                xfloat32_t xf32;
+            };
+            constexpr Fp32Bits(uint32_t initVal)
+                : i32(initVal)
+            {
+            }
+            constexpr Fp32Bits(float32_t initVal)
+                : f32(initVal)
+            {
+            }
+            constexpr Fp32Bits(xfloat32_t initVal)
+                : xf32(initVal)
+            {
+            }
+        };
+
+    } // namespace detail
+} // namespace hiptensor
+
+namespace std
+{
+    ///////////////////////////////////////////////////////////
+    ///////////  std::numeric_limits<float16_t>  //////////////
+    ///////////////////////////////////////////////////////////
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t
+        numeric_limits<hiptensor::float16_t>::epsilon() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x1400));
+        return eps.f16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t
+        numeric_limits<hiptensor::float16_t>::infinity() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7C00));
+        return eps.f16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t
+        numeric_limits<hiptensor::float16_t>::lowest() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0xFBFF));
+        return eps.f16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t
+        numeric_limits<hiptensor::float16_t>::max() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7BFF));
+        return eps.f16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t
+        numeric_limits<hiptensor::float16_t>::min() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x0400));
+        return eps.f16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t
+        numeric_limits<hiptensor::float16_t>::quiet_NaN() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7FFF));
+        return eps.f16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t
+        numeric_limits<hiptensor::float16_t>::signaling_NaN() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7DFF));
+        return eps.f16;
+    }
+
+    ///////////////////////////////////////////////////////////
+    ///////////  std::numeric_limits<hfloat16_t>  /////////////
+    ///////////////////////////////////////////////////////////
+#if !HIPTENSOR_NO_HALF
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::hfloat16_t
+        numeric_limits<hiptensor::hfloat16_t>::epsilon() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x1400));
+        return eps.h16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::hfloat16_t
+        numeric_limits<hiptensor::hfloat16_t>::infinity() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7C00));
+        return eps.h16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::hfloat16_t
+        numeric_limits<hiptensor::hfloat16_t>::lowest() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0xFBFF));
+        return eps.h16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::hfloat16_t
+        numeric_limits<hiptensor::hfloat16_t>::max() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7BFF));
+        return eps.h16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::hfloat16_t
+        numeric_limits<hiptensor::hfloat16_t>::min() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x0400));
+        return eps.h16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::hfloat16_t
+        numeric_limits<hiptensor::hfloat16_t>::quiet_NaN() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7FFF));
+        return eps.h16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::hfloat16_t
+        numeric_limits<hiptensor::hfloat16_t>::signaling_NaN() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7DFF));
+        return eps.h16;
+    }
+
+#endif // !HIPTENSOR_NO_HALF
+
+    ///////////////////////////////////////////////////////////
+    ///////////  std::numeric_limits<bfloat16_t>  /////////////
+    ///////////////////////////////////////////////////////////
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::bfloat16_t
+        numeric_limits<hiptensor::bfloat16_t>::epsilon() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x3C00));
+        return eps.b16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::bfloat16_t
+        numeric_limits<hiptensor::bfloat16_t>::infinity() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7F80));
+        return eps.b16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::bfloat16_t
+        numeric_limits<hiptensor::bfloat16_t>::lowest() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0xFF7F));
+        return eps.b16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::bfloat16_t
+        numeric_limits<hiptensor::bfloat16_t>::max() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7F7F));
+        return eps.b16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::bfloat16_t
+        numeric_limits<hiptensor::bfloat16_t>::min() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x007F));
+        return eps.b16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::bfloat16_t
+        numeric_limits<hiptensor::bfloat16_t>::quiet_NaN() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7FC0));
+        return eps.b16;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::bfloat16_t
+        numeric_limits<hiptensor::bfloat16_t>::signaling_NaN() noexcept
+    {
+        hiptensor::detail::Fp16Bits eps(static_cast<uint16_t>(0x7FC0));
+        return eps.b16;
+    }
+
+    ///////////////////////////////////////////////////////////
+    ///////////  std::numeric_limits<xfloat32_t>  //////////////
+    ///////////////////////////////////////////////////////////
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t
+        numeric_limits<hiptensor::xfloat32_t>::epsilon() noexcept
+    {
+        hiptensor::detail::Fp32Bits eps(static_cast<float>(FLT_EPSILON));
+        return eps.xf32;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t
+        numeric_limits<hiptensor::xfloat32_t>::infinity() noexcept
+    {
+        hiptensor::detail::Fp32Bits eps(static_cast<float>(HUGE_VALF));
+        return eps.xf32;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t
+        numeric_limits<hiptensor::xfloat32_t>::lowest() noexcept
+    {
+        hiptensor::detail::Fp32Bits eps(static_cast<float>(-FLT_MAX));
+        return eps.xf32;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t
+        numeric_limits<hiptensor::xfloat32_t>::max() noexcept
+    {
+        hiptensor::detail::Fp32Bits eps(static_cast<float>(FLT_MAX));
+        return eps.xf32;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t
+        numeric_limits<hiptensor::xfloat32_t>::min() noexcept
+    {
+        hiptensor::detail::Fp32Bits eps(static_cast<float>(FLT_MIN));
+        return eps.xf32;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t
+        numeric_limits<hiptensor::xfloat32_t>::quiet_NaN() noexcept
+    {
+        hiptensor::detail::Fp32Bits eps(static_cast<uint32_t>(0x7FF80000));
+        return eps.xf32;
+    }
+
+    template <>
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t
+        numeric_limits<hiptensor::xfloat32_t>::signaling_NaN() noexcept
+    {
+        hiptensor::detail::Fp32Bits eps(static_cast<uint32_t>(0x7FF00000));
+        return eps.xf32;
+    }
+    // @endcond
+
+} // namespace std
+
+namespace hiptensor
+{
+    template <typename T, typename std::enable_if_t<std::is_integral<T>::value, int> = 0>
+    constexpr auto maxExactInteger() -> decltype(std::numeric_limits<T>::max())
+    {
+        return std::numeric_limits<T>::max();
+    }
+
+    template <typename T,
+              typename std::enable_if_t<std::is_floating_point<T>::value
+                                            && std::numeric_limits<T>::digits,
+                                        int>
+              = 0>
+    constexpr auto maxExactInteger() ->
+        typename std::conditional_t<std::is_same<T, float64_t>::value, int64_t, int32_t>
+    {
+        using RetT =
+            typename std::conditional_t<std::is_same<T, float64_t>::value, int64_t, int32_t>;
+        return ((RetT)1 << std::numeric_limits<T>::digits);
+    }
+
+    template <typename T,
+              typename std::enable_if_t<
+#if !HIPTENSOR_NO_HALF
+                  std::is_same<T, hfloat16_t>::value ||
+#endif // !HIPTENSOR_NO_HALF
+                      std::is_same<T, float16_t>::value,
+                  int>
+              = 0>
+    constexpr auto maxExactInteger() -> int32_t
+    {
+        // f16 mantissa is 10 bits
+        return ((int32_t)1 << 11);
+    }
+
+    template <typename T, typename std::enable_if_t<std::is_same<T, bfloat16_t>::value, int> = 0>
+    constexpr auto maxExactInteger() -> int32_t
+    {
+        // b16 mantissa is 7 bits
+        return ((int32_t)1 << 8);
+    }
+
+    template <typename T, typename std::enable_if_t<std::is_same<T, xfloat32_t>::value, int> = 0>
+    constexpr auto maxExactInteger() -> int32_t
+    {
+        // xf32 mantissa is 7 bits
+        return ((int32_t)1 << 8);
+    }
+} // namespace hiptensor
+
+#endif // HIPTENSOR_TYPE_TRAITS_HPP
diff --git a/library/src/include/types.hpp b/library/src/include/types.hpp
index 8cd677f3..b59604be 100644
--- a/library/src/include/types.hpp
+++ b/library/src/include/types.hpp
@@ -19,60 +19,10 @@
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  *
  *******************************************************************************/
-
-#ifndef HIPTENSOR_LIBRARY_TYPES_HPP
-#define HIPTENSOR_LIBRARY_TYPES_HPP
-
-// clang-format off
-// Include order needs to be preserved
-#include <hip/library_types.h>
-#include <hip/hip_bfloat16.h>
-#include <hip/hip_fp16.h>
-#include <iostream>
-
-#include <hiptensor/hiptensor_types.hpp>
-
-// clang-format on
-
-namespace hiptensor
-{
-    // Used to map to empty tensors
-    struct NoneType;
-
-    static constexpr hipDataType NONE_TYPE = (hipDataType)31;
-
-    // Map type to runtime HipDataType
-    template <typename T>
-    struct HipDataType;
-
-    template <typename T>
-    static constexpr auto HipDataType_v = HipDataType<T>::value;
-
-    // Get data size in bytes from id
-    uint32_t hipDataTypeSize(hipDataType id);
-
-    // Convert hipDataType to hiptensorComputeType_t
-    hiptensorComputeType_t convertToComputeType(hipDataType hipType);
-
-    // Read a single value from void pointer, casted to T
-    template <typename T>
-    T readVal(void const* value, hipDataType id);
-
-    template <typename T>
-    T readVal(void const* value, hiptensorComputeType_t id);
-
-} // namespace hiptensor
-
-bool operator==(hipDataType hipType, hiptensorComputeType_t computeType);
-bool operator==(hiptensorComputeType_t computeType, hipDataType hipType);
-
-bool operator!=(hipDataType hipType, hiptensorComputeType_t computeType);
-bool operator!=(hiptensorComputeType_t computeType, hipDataType hipType);
-
-#include "types_impl.hpp"
-
-#endif // HIPTENSOR_LIBRARY_TYPES_HPP
+#include "native_types.hpp"
+#include "type_traits.hpp"
+#include "types_ext.hpp"
diff --git a/library/src/include/types_ext.hpp b/library/src/include/types_ext.hpp
new file mode 100644
index 00000000..2f2b09e1
--- /dev/null
+++ b/library/src/include/types_ext.hpp
@@ -0,0 +1,181 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef HIPTENSOR_TYPES_EXT_HPP
+#define HIPTENSOR_TYPES_EXT_HPP
+
+#include <cmath>
+#include <hip/hip_bfloat16.h>
+#include <limits>
+#include <ostream>
+
+#include "type_traits.hpp"
+
+namespace hiptensor
+{
+    ////////////////////////////////////////////////////////////////////////
+    ///////////  hiptensor::hfloat16_t host and device conversions  //////////
+    ////////////////////////////////////////////////////////////////////////
+    template <typename Outgoing,
+              typename Incoming,
+              typename std::enable_if_t<!std::is_same_v<Incoming, Outgoing>, int> = 0>
+    __host__ __device__ inline Outgoing convert(const Incoming& value)
+    {
+#if !HIPTENSOR_NO_HALF
+        if constexpr(std::is_same_v<Outgoing, hfloat16_t>)
+        {
+
+#if defined(__HIP_NO_HALF_CONVERSIONS__)
+            detail::Fp16Bits fp16(static_cast<float16_t>(value));
+            return fp16.h16;
+#else
+            return static_cast<hfloat16_t>(value);
+#endif // defined(__HIP_NO_HALF_CONVERSIONS__)
+        }
+        else if constexpr(std::is_same_v<Incoming, hfloat16_t>)
+        {
+
+#if defined(__HIP_NO_HALF_CONVERSIONS__)
+            detail::Fp16Bits fp16(value);
+            return static_cast<Outgoing>(fp16.f16);
+#else
+            return static_cast<Outgoing>(value);
+#endif // defined(__HIP_NO_HALF_CONVERSIONS__)
+        }
+        else
+#endif // !HIPTENSOR_NO_HALF
+        {
+            return static_cast<Outgoing>(value);
+        }
+    }
+
+    template <typename Outgoing,
+              typename Incoming,
+              typename std::enable_if_t<std::is_same_v<Incoming, Outgoing>, int> = 0>
+    __host__ __device__ inline Outgoing const& convert(const Incoming& value)
+    {
+        return value;
+    }
+
+    ////////////////////////////////////////////////////////////////////
+    ///////////  hiptensor::hfloat16_t host & device operators  //////////
+    ///////////////////////////////////////////////////////////////////
+
+#if defined(__HIP_NO_HALF_OPERATORS__)
+// No operators defined for host or device
+#define HIPTENSOR_HALF_OP_ATTR HIPTENSOR_HOST_DEVICE
+#else
+// No operators defined just for host
+#define HIPTENSOR_HALF_OP_ATTR HIPTENSOR_HOST
+#endif // defined(__HIP_NO_HALF_OPERATORS__)
+
+#if !HIPTENSOR_NO_HALF
+
+    HIPTENSOR_HALF_OP_ATTR inline bool operator==(const hfloat16_t& x, const hfloat16_t& y)
+    {
+        auto absDiff = std::fabs(__half2float(x) - __half2float(y));
+        auto absAdd  = std::fabs(__half2float(x) + __half2float(y));
+        return absDiff <= __half2float(std::numeric_limits<hfloat16_t>::epsilon()) * absAdd * 2.0f
+               || absDiff < __half2float(std::numeric_limits<hfloat16_t>::min());
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline bool operator!=(const hfloat16_t& x, const hfloat16_t& y)
+    {
+        return !(x == y);
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t operator-(const hfloat16_t& x)
+    {
+        detail::Fp16Bits fp16(x);
+        fp16.i16 ^= 0x8000; // Flip sign
+        return fp16.h16;
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t operator+(const hfloat16_t& x, const hfloat16_t& y)
+    {
+        return convert<hfloat16_t>(convert<float16_t>(x) + convert<float16_t>(y));
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t operator-(const hfloat16_t& x, const hfloat16_t& y)
+    {
+        return convert<hfloat16_t>(convert<float16_t>(x) - convert<float16_t>(y));
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t operator*(const hfloat16_t& x, const hfloat16_t& y)
+    {
+        return convert<hfloat16_t>(convert<float16_t>(x) * convert<float16_t>(y));
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t operator/(const hfloat16_t& x, const hfloat16_t& y)
+    {
+        return convert<hfloat16_t>(convert<float16_t>(x) / convert<float16_t>(y));
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t& operator+=(hfloat16_t& x, const hfloat16_t& y)
+    {
+        return x = x + y;
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t& operator-=(hfloat16_t& x, const hfloat16_t& y)
+    {
+        return x = x - y;
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t& operator*=(hfloat16_t& x, const hfloat16_t& y)
+    {
+        return x = x * y;
+    }
+
+    HIPTENSOR_HALF_OP_ATTR inline hfloat16_t& operator/=(hfloat16_t& x, const hfloat16_t& y)
+    {
+        return x = x / y;
+    }
+
+#endif // !HIPTENSOR_NO_HALF
+} // namespace hiptensor
+
+namespace std
+{
+    ///////////////////////////////////////////////////////////
+    //////////  std::ostream::operator<<(float16_t)  //////////
+    ///////////////////////////////////////////////////////////
+
+    inline ostream& operator<<(ostream& stream, hiptensor::float16_t const& val)
+    {
+        return stream << static_cast<float>(val);
+    }
+
+    ///////////////////////////////////////////////////////////
+    //////////  std::ostream::operator<<(hfloat16_t)  /////////
+    ///////////////////////////////////////////////////////////
+#if !HIPTENSOR_NO_HALF
+    inline ostream& operator<<(ostream& stream, hiptensor::hfloat16_t const& val)
+    {
+        return stream << __half2float(val);
+    }
+#endif // !HIPTENSOR_NO_HALF
+} // namespace std
+
+#endif // HIPTENSOR_TYPES_EXT_HPP
diff --git a/library/src/include/xfloat32.hpp b/library/src/include/xfloat32.hpp
new file mode 100644
index 00000000..6e9168cf
--- /dev/null
+++ b/library/src/include/xfloat32.hpp
@@ -0,0 +1,334 @@
+/* ************************************************************************
+ * Copyright (C) 2016-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+ * ies of the Software, and to permit persons to whom the Software is furnished
+ * to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+ * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+ * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ************************************************************************ */
+
+/*!\file
+ * \brief xfloat32.h provides struct for hiptensor_xfloat32 typedef
+ */
+
+#ifndef HIPTENSOR_XFLOAT32_HPP
+#define HIPTENSOR_XFLOAT32_HPP
+
+#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+
+// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only
+// include a minimal definition of hiptensor_xfloat32
+
+#include <stdint.h>
+typedef struct
+{
+    float data;
+} hiptensor_xfloat32;
+
+#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <hip/hip_runtime.h>
+#include <ostream>
+#include <type_traits>
+
+#include "config.hpp"
+
+struct hiptensor_xfloat32
+{
+    float data;
+
+    enum round_t
+    {
+        round_up
+    };
+
+    HIPTENSOR_HOST_DEVICE hiptensor_xfloat32() = default;
+
+    // round upper 19 bits of IEEE float to convert to xfloat32
+    explicit HIPTENSOR_HOST_DEVICE hiptensor_xfloat32(float f, round_t)
+        : data(float_to_xfloat32(f))
+    {
+    }
+
+    explicit HIPTENSOR_HOST_DEVICE hiptensor_xfloat32(float f)
+        : data(truncate_float_to_xfloat32(f))
+    {
+    }
+
+    // zero extend lower 13 bits of xfloat32 to convert to IEEE float
+    HIPTENSOR_HOST_DEVICE operator float() const
+    {
+        return data;
+    }
+
+    explicit HIPTENSOR_HOST_DEVICE operator bool() const
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {data};
+        return u.int32 & 0x7fffe000;
+    }
+
+    explicit HIPTENSOR_HOST_DEVICE operator uint32_t() const
+    {
+        return uint32_t(float(*this));
+    }
+
+    explicit HIPTENSOR_HOST_DEVICE operator long() const
+    {
+        return long(float(*this));
+    }
+
+    explicit HIPTENSOR_HOST_DEVICE operator double() const
+    {
+        return double(float(*this));
+    }
+
+private:
+    static HIPTENSOR_HOST_DEVICE float float_to_xfloat32(float f)
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {f};
+        if(~u.int32 & 0x7f800000)
+        {
+            // When the exponent bits are not all 1s, then the value is zero, normal,
+            // or subnormal. We round the xfloat32 mantissa up by adding 0xFFF, plus
+            // 1 if the least significant bit of the xfloat32 mantissa is 1 (odd).
+            // This causes the xfloat32's mantissa to be incremented by 1 if the 13
+            // least significant bits of the float mantissa are greater than 0x1000,
+            // or if they are equal to 0x1000 and the least significant bit of the
+            // xfloat32 mantissa is 1 (odd). This causes it to be rounded to even when
+            // the lower 13 bits are exactly 0x1000. If the xfloat32 mantissa already
+            // has the value 0x3ff, then incrementing it causes it to become 0x00 and
+            // the exponent is incremented by one, which is the next higher FP value
+            // to the unrounded xfloat32 value. When the xfloat32 value is subnormal
+            // with an exponent of 0x00 and a mantissa of 0x3FF, it may be rounded up
+            // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
+            // When the xfloat32 value has an exponent of 0xFE and a mantissa of 0x3FF,
+            // incrementing it causes it to become an exponent of 0xFF and a mantissa
+            // of 0x00, which is Inf, the next higher value to the unrounded value.
+
+            u.int32 += 0xfff + ((u.int32 >> 13) & 1); // Round to nearest, round to even
+        }
+        else if(u.int32 & 0x1fff)
+        {
+            // When all of the exponent bits are 1, the value is Inf or NaN.
+            // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
+            // mantissa bit. Quiet NaN is indicated by the most significant mantissa
+            // bit being 1. Signaling NaN is indicated by the most significant
+            // mantissa bit being 0 but some other bit(s) being 1. If any of the
+            // lower 13 bits of the mantissa are 1, we set the least significant bit
+            // of the xfloat32 mantissa, in order to preserve signaling NaN in case
+            // the xfloat32's mantissa bits are all 0.
+            u.int32 |= 0x2000; // Preserve signaling NaN
+        }
+
+        u.int32 &= 0xffffe000;
+        return u.fp32;
+    }
+
+    // Truncate instead of rounding
+    static HIPTENSOR_HOST_DEVICE float truncate_float_to_xfloat32(float f)
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {f};
+
+        u.int32 = u.int32 & 0xffffe000;
+        return u.fp32;
+    }
+};
+
+typedef struct
+{
+    float data;
+} hiptensor_xfloat32_public;
+
+static_assert(std::is_standard_layout<hiptensor_xfloat32>{},
+              "hiptensor_xfloat32 is not a standard layout type, and thus is "
+              "incompatible with C.");
+
+static_assert(std::is_trivial<hiptensor_xfloat32>{},
+              "hiptensor_xfloat32 is not a trivial type, and thus is "
+              "incompatible with C.");
+
+static_assert(sizeof(hiptensor_xfloat32) == sizeof(hiptensor_xfloat32_public)
+                  && offsetof(hiptensor_xfloat32, data)
+                         == offsetof(hiptensor_xfloat32_public, data),
+              "internal hiptensor_xfloat32 does not match public hiptensor_xfloat32");
+
+inline std::ostream& operator<<(std::ostream& os, const hiptensor_xfloat32& xf32)
+{
+    return os << float(xf32);
+}
+
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator+(hiptensor_xfloat32 a)
+{
+    return a;
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator-(hiptensor_xfloat32 a)
+{
+    union
+    {
+        float    fp32;
+        uint32_t int32;
+    } u = {a.data};
+    u.int32 ^= 0x80000000;
+    return hiptensor_xfloat32(u.fp32);
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator+(hiptensor_xfloat32 a,
+                                                          hiptensor_xfloat32 b)
+{
+    return hiptensor_xfloat32(float(a) + float(b));
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator-(hiptensor_xfloat32 a,
+                                                          hiptensor_xfloat32 b)
+{
+    return hiptensor_xfloat32(float(a) - float(b));
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator*(hiptensor_xfloat32 a,
+                                                          hiptensor_xfloat32 b)
+{
+    return hiptensor_xfloat32(float(a) * float(b));
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator/(hiptensor_xfloat32 a,
+                                                          hiptensor_xfloat32 b)
+{
+    return hiptensor_xfloat32(float(a) / float(b));
+}
+inline HIPTENSOR_HOST_DEVICE bool operator<(hiptensor_xfloat32 a, hiptensor_xfloat32 b)
+{
+    return float(a) < float(b);
+}
+inline HIPTENSOR_HOST_DEVICE bool operator==(hiptensor_xfloat32 a, hiptensor_xfloat32 b)
+{
+    return float(a) == float(b);
+}
+inline HIPTENSOR_HOST_DEVICE bool operator>(hiptensor_xfloat32 a, hiptensor_xfloat32 b)
+{
+    return b < a;
+}
+inline HIPTENSOR_HOST_DEVICE bool operator<=(hiptensor_xfloat32 a, hiptensor_xfloat32 b)
+{
+    return !(a > b);
+}
+inline HIPTENSOR_HOST_DEVICE bool operator!=(hiptensor_xfloat32 a, hiptensor_xfloat32 b)
+{
+    return !(a == b);
+}
+inline HIPTENSOR_HOST_DEVICE bool operator>=(hiptensor_xfloat32 a, hiptensor_xfloat32 b)
+{
+    return !(a < b);
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator+=(hiptensor_xfloat32& a,
+                                                            hiptensor_xfloat32  b)
+{
+    return a = a + b;
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator-=(hiptensor_xfloat32& a,
+                                                            hiptensor_xfloat32  b)
+{
+    return a = a - b;
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator*=(hiptensor_xfloat32& a,
+                                                            hiptensor_xfloat32  b)
+{
+    return a = a * b;
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator/=(hiptensor_xfloat32& a,
+                                                            hiptensor_xfloat32  b)
+{
+    return a = a / b;
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator++(hiptensor_xfloat32& a)
+{
+    return a += hiptensor_xfloat32(1.0f);
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator--(hiptensor_xfloat32& a)
+{
+    return a -= hiptensor_xfloat32(1.0f);
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator++(hiptensor_xfloat32& a, int)
+{
+    hiptensor_xfloat32 orig = a;
+    ++a;
+    return orig;
+}
+inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator--(hiptensor_xfloat32& a, int)
+{
+    hiptensor_xfloat32 orig = a;
+    --a;
+    return orig;
+}
+
+namespace std
+{
+    constexpr HIPTENSOR_HOST_DEVICE bool isinf(hiptensor_xfloat32 a)
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {a.data};
+        return !(~u.int32 & 0x7f800000) && !(u.int32 & 0x7fe000);
+    }
+    constexpr HIPTENSOR_HOST_DEVICE bool isnan(hiptensor_xfloat32 a)
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {a.data};
+        return !(~u.int32 & 0x7f800000) && +(u.int32 & 0x7fe000);
+    }
+    constexpr HIPTENSOR_HOST_DEVICE bool iszero(hiptensor_xfloat32 a)
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {a.data};
+        return (u.fp32 == 0.0f);
+    }
+
+    HIPTENSOR_HOST_DEVICE inline hiptensor_xfloat32 sin(hiptensor_xfloat32 a)
+    {
+        return hiptensor_xfloat32(sinf(float(a)));
+    }
+    HIPTENSOR_HOST_DEVICE inline hiptensor_xfloat32 cos(hiptensor_xfloat32 a)
+    {
+        return hiptensor_xfloat32(cosf(float(a)));
+    }
+
+    HIPTENSOR_HOST_DEVICE constexpr hiptensor_xfloat32 real(const hiptensor_xfloat32& a)
+    {
+        return a;
+    }
+}
+
+#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+
+#endif // HIPTENSOR_XFLOAT32_HPP
diff --git a/library/src/permutation/permutation_ck_impl.hpp b/library/src/permutation/permutation_ck_impl.hpp
index 7878e908..0c73ccff 100644
--- a/library/src/permutation/permutation_ck_impl.hpp
+++ b/library/src/permutation/permutation_ck_impl.hpp
@@ -32,8 +32,8 @@
 #include <ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp>
 #include <ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp>
 
+#include "data_types.hpp"
 #include "performance.hpp"
-#include "types.hpp"
 
 namespace hiptensor
 {
diff --git a/library/src/permutation/permutation_cpu_reference_impl.hpp b/library/src/permutation/permutation_cpu_reference_impl.hpp
index d64147fe..c1d4a3af 100644
--- a/library/src/permutation/permutation_cpu_reference_impl.hpp
+++ b/library/src/permutation/permutation_cpu_reference_impl.hpp
@@ -29,8 +29,8 @@
 #include <unordered_map>
 #include <vector>
 
+#include "data_types.hpp"
 #include "permutation_cpu_reference.hpp"
-#include "types.hpp"
 #include "util.hpp"
 
 namespace hiptensor
diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp
index 4ad97610..5d745d12 100644
--- a/test/01_contraction/contraction_test.cpp
+++ b/test/01_contraction/contraction_test.cpp
@@ -25,7 +25,7 @@
  *******************************************************************************/
 #include <hiptensor/hiptensor.hpp>
 
-#include "types.hpp"
+#include "data_types.hpp"
 #include "llvm/hiptensor_options.hpp"
 
 #include "contraction/contraction_cpu_reference.hpp"
diff --git a/test/02_permutation/permutation_resource.cpp b/test/02_permutation/permutation_resource.cpp
index 4323e01d..1f448ff8 100644
--- a/test/02_permutation/permutation_resource.cpp
+++ b/test/02_permutation/permutation_resource.cpp
@@ -28,7 +28,7 @@
 #define HIPTENSOR_PERMUTATION_RESOURCE_IMPL_HPP
 
 #include "permutation_resource.hpp"
-#include "types.hpp"
+#include "data_types.hpp"
 #include "utils.hpp"
 
 namespace hiptensor
diff --git a/test/02_permutation/permutation_test.cpp b/test/02_permutation/permutation_test.cpp
index dbb52d6e..cfadf5c0 100644
--- a/test/02_permutation/permutation_test.cpp
+++ b/test/02_permutation/permutation_test.cpp
@@ -25,10 +25,10 @@
  *******************************************************************************/
 #include <hiptensor/hiptensor.hpp>
 
+#include "data_types.hpp"
 #include "logger.hpp"
 #include "permutation/permutation_cpu_reference.hpp"
 #include "permutation_test.hpp"
-#include "types.hpp"
 #include "utils.hpp"
 #include "llvm/hiptensor_options.hpp"
 
diff --git a/test/utils.hpp b/test/utils.hpp
index 67418568..1f7ece44 100644
--- a/test/utils.hpp
+++ b/test/utils.hpp
@@ -43,6 +43,7 @@
 #include <hiptensor/internal/hiptensor_utility.hpp>
 
 #include "device/common.hpp"
+#include "types.hpp"
 
 #define HIPTENSOR_FREE_DEVICE(ptr)     \
     if(ptr != nullptr)                 \
@@ -209,19 +210,6 @@ std::pair<bool, double> compareEqual(DDataType const* deviceD,
     return std::make_pair(retval, max_relative_error);
 }
 
-template <typename DDataType>
-double getEpsilon()
-{
-    if(std::is_same_v<DDataType, _Float16>)
-    {
-        return 0.0009765625; // numeric_limits<_Float16>::epsilon() => 0
-    }
-    else
-    {
-        return std::numeric_limits<DDataType>::epsilon();
-    }
-};
-
 template <typename DDataType>
 std::pair<bool, double> compareEqualLaunchKernel(DDataType*  deviceD,
                                                  DDataType*  hostD,
@@ -288,7 +276,7 @@ std::pair<bool, double> compareEqualLaunchKernel(DDataType*  deviceD,
     auto toDouble
         = [](DDataType const& val) { return static_cast<double>(static_cast<float>(val)); };
 
-    auto eps = getEpsilon<DDataType>();
+    auto eps = toDouble(std::numeric_limits<DDataType>::epsilon());
     if(isNaN)
     {
         retval           = false;