From c5fbcec9afdfe109b1ba1a15d74d819beec2e6fe Mon Sep 17 00:00:00 2001
From: Cong Ma
Date: Wed, 22 Nov 2023 16:21:33 +0000
Subject: [PATCH 01/12] Add support for f16 and bf16 to contraction

- Support _Float16
- Support hip_bfloat16
- Add unit tests for _Float16 and hip_bfloat16
- Add samples for _Float16 and hip_bfloat16
---
 .../hiptensor/internal}/config.hpp            |   0
 .../hiptensor/internal/hiptensor_utility.hpp  |   1 +
 .../hiptensor/internal}/native_types.hpp      |   0
 .../hiptensor/internal}/native_types_impl.hpp |   0
 .../hiptensor/internal}/type_traits.hpp       |   0
 .../hiptensor/internal}/types.hpp             |   0
 .../hiptensor/internal}/types_ext.hpp         |   0
 .../hiptensor/internal}/xfloat32.hpp          |   0
 .../contraction_cpu_reference_impl.hpp        |  19 +-
 .../contraction_cpu_reference_instances.cpp   |  56 +++
 .../contraction/contraction_meta_traits.hpp   |  47 ++-
 .../src/contraction/contraction_selection.cpp | 280 +++++++++++++-
 .../src/contraction/contraction_solution.hpp  |   3 +-
 .../contraction/contraction_solution_impl.hpp |   6 +-
 .../contraction_solution_instances.cpp        |  57 +++
 library/src/contraction/device/CMakeLists.txt |  56 ++-
 ...16_bf16_bf16_compute_f32_kknn_instance.cpp |  62 ++++
 ...16_bf16_bf16_compute_f32_knnn_instance.cpp |  62 ++++
 ...16_bf16_bf16_compute_f32_mknn_instance.cpp |  62 ++++
 ...16_bf16_bf16_compute_f32_mnnn_instance.cpp |  62 ++++
 ..._f16_f16_f16_compute_f32_kknn_instance.cpp |  62 ++++
 ..._f16_f16_f16_compute_f32_knnn_instance.cpp |  62 ++++
 ..._f16_f16_f16_compute_f32_mknn_instance.cpp |  62 ++++
 ..._f16_f16_f16_compute_f32_mnnn_instance.cpp |  62 ++++
 ...f16_bf16_bf16_compute_f32_kkn_instance.cpp |  62 ++++
 ...f16_bf16_bf16_compute_f32_knn_instance.cpp |  62 ++++
 ...f16_bf16_bf16_compute_f32_mkn_instance.cpp |  62 ++++
 ...f16_bf16_bf16_compute_f32_mnn_instance.cpp |  62 ++++
 ...e_f16_f16_f16_compute_f32_kkn_instance.cpp |  62 ++++
 ...e_f16_f16_f16_compute_f32_knn_instance.cpp |  62 ++++
 ...e_f16_f16_f16_compute_f32_mkn_instance.cpp |  62 ++++
 ...e_f16_f16_f16_compute_f32_mnn_instance.cpp |  62 ++++
 ...hpp => hiptensor_contraction_bilinear.hpp} |   0
 ...le.hpp => hiptensor_contraction_scale.hpp} |   0
 .../src/contraction/hiptensor_contraction.cpp |  11 -
 library/src/hiptensor.cpp                     |   3 +-
 samples/01_contraction/CMakeLists.txt         |  15 +
 .../simple_bilinear_contraction_bf16.cpp      | 342 ++++++++++++++++++
 .../simple_bilinear_contraction_f16.cpp       | 342 ++++++++++++++++++
 .../simple_scale_contraction_bf16.cpp         | 334 +++++++++++++++++
 .../simple_scale_contraction_f16.cpp          | 334 +++++++++++++++++
 .../configs/bilinear_test_params.yaml         |   2 +
 .../configs/scale_test_params.yaml            |   2 +
 test/01_contraction/contraction_test.cpp      | 109 +++++-
 test/device/common.hpp                        |   2 +-
 test/llvm/yaml_parser_config.cpp              |   1 +
 test/utils.hpp                                |   2 +-
 47 files changed, 2944 insertions(+), 72 deletions(-)
 rename library/{src/include => include/hiptensor/internal}/config.hpp (100%)
 rename library/{src/include => include/hiptensor/internal}/native_types.hpp (100%)
 rename library/{src/include => include/hiptensor/internal}/native_types_impl.hpp (100%)
 rename library/{src/include => include/hiptensor/internal}/type_traits.hpp (100%)
 rename library/{src/include => include/hiptensor/internal}/types.hpp (100%)
 rename library/{src/include => include/hiptensor/internal}/types_ext.hpp (100%)
 rename library/{src/include => include/hiptensor/internal}/xfloat32.hpp (100%)
 create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp
 rename library/src/contraction/device/{contraction_bilinear.hpp => hiptensor_contraction_bilinear.hpp} (100%)
 rename library/src/contraction/device/{contraction_scale.hpp => hiptensor_contraction_scale.hpp} (100%)
 create mode 100644 samples/01_contraction/simple_bilinear_contraction_bf16.cpp
 create mode 100644 samples/01_contraction/simple_bilinear_contraction_f16.cpp
 create mode 100644 samples/01_contraction/simple_scale_contraction_bf16.cpp
 create mode 100644 samples/01_contraction/simple_scale_contraction_f16.cpp

diff --git a/library/src/include/config.hpp b/library/include/hiptensor/internal/config.hpp
similarity index 100%
rename from library/src/include/config.hpp
rename to library/include/hiptensor/internal/config.hpp
diff --git a/library/include/hiptensor/internal/hiptensor_utility.hpp b/library/include/hiptensor/internal/hiptensor_utility.hpp
index f2df2dd2..c386bbe0 100644
--- a/library/include/hiptensor/internal/hiptensor_utility.hpp
+++ b/library/include/hiptensor/internal/hiptensor_utility.hpp
@@ -31,6 +31,7 @@
 #include

 #include "../hiptensor_types.hpp"
+#include "types_ext.hpp"

 #ifndef CHECK_HIP_ERROR
 #define CHECK_HIP_ERROR(expression) \
diff --git a/library/src/include/native_types.hpp b/library/include/hiptensor/internal/native_types.hpp
similarity index 100%
rename from library/src/include/native_types.hpp
rename to library/include/hiptensor/internal/native_types.hpp
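[Editorial note, not part of the patch] The hunks that follow all serve one change: a new ComputeDataType template parameter, defaulting to ADataType, is threaded through ReferenceContraction_M2_N2_K2, MetaTraits, enumerateReferenceSolutions, and enumerateContractionSolutions, and new f16/bf16 instances are registered with ComputeDataType = float. On the hiptensor-facing side, MetaTraits swaps CK's ck::bhalf_t for the native hip_bfloat16 via std::conditional_t, and contraction_selection.cpp dispatches (typeA, typeB, typeD, typeE) tuples such as (HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F) to ActorCriticSelection specializations that currently pin a hard-coded kernel unique_id. The sketch below is a minimal, compilable model of the two type-level mechanisms only; MiniContraction, to_hip_type, bhalf_t, and hip_bf16 are hypothetical stand-ins, not hipTensor or CK API.

// Editor's sketch: models the ComputeDataType default and the bf16 type
// mapping introduced by this patch. Requires a compiler with _Float16
// support (e.g. hipcc/clang); all names here are illustrative stand-ins.
#include <cstdio>
#include <type_traits>

struct bhalf_t  { unsigned short x; }; // stand-in for ck::bhalf_t
struct hip_bf16 { unsigned short x; }; // stand-in for hip_bfloat16

// 1) ComputeDataType is a new trailing template parameter defaulting to
//    ADataType, so existing f32/f64 instantiations keep their old meaning
//    while f16/bf16 instances can accumulate in float.
template <typename ADataType,
          typename BDataType,
          typename EDataType,
          typename ComputeDataType = ADataType>
struct MiniContraction
{
    using Compute = ComputeDataType;
};

// 2) MetaTraits-style mapping: where CK spells bf16 as ck::bhalf_t, the
//    hiptensor-facing traits substitute the native HIP type.
template <typename T>
using to_hip_type = std::conditional_t<std::is_same_v<T, bhalf_t>, hip_bf16, T>;

int main()
{
    // f32 path: compute type defaults to the input type, as before the patch.
    static_assert(std::is_same_v<MiniContraction<float, float, float>::Compute, float>);

    // f16/bf16 paths: inputs stay narrow, accumulation is promoted to float,
    // mirroring the new ..._compute_f32_... instances registered below.
    static_assert(
        std::is_same_v<MiniContraction<_Float16, _Float16, _Float16, float>::Compute, float>);
    static_assert(
        std::is_same_v<MiniContraction<bhalf_t, bhalf_t, bhalf_t, float>::Compute, float>);

    // ck::bhalf_t is presented to hiptensor clients as the native bf16 type.
    static_assert(std::is_same_v<to_hip_type<bhalf_t>, hip_bf16>);
    static_assert(std::is_same_v<to_hip_type<float>, float>);

    std::puts("type plumbing checks passed");
}

Defaulting the new trailing parameter keeps every pre-existing instantiation source-compatible, which is why the f32 and f64 registration calls in the hunks below can remain unchanged while the f16/bf16 registrations pass an explicit trailing float.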
diff --git a/library/src/include/native_types_impl.hpp b/library/include/hiptensor/internal/native_types_impl.hpp similarity index 100% rename from library/src/include/native_types_impl.hpp rename to library/include/hiptensor/internal/native_types_impl.hpp diff --git a/library/src/include/type_traits.hpp b/library/include/hiptensor/internal/type_traits.hpp similarity index 100% rename from library/src/include/type_traits.hpp rename to library/include/hiptensor/internal/type_traits.hpp diff --git a/library/src/include/types.hpp b/library/include/hiptensor/internal/types.hpp similarity index 100% rename from library/src/include/types.hpp rename to library/include/hiptensor/internal/types.hpp diff --git a/library/src/include/types_ext.hpp b/library/include/hiptensor/internal/types_ext.hpp similarity index 100% rename from library/src/include/types_ext.hpp rename to library/include/hiptensor/internal/types_ext.hpp diff --git a/library/src/include/xfloat32.hpp b/library/include/hiptensor/internal/xfloat32.hpp similarity index 100% rename from library/src/include/xfloat32.hpp rename to library/include/hiptensor/internal/xfloat32.hpp diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index 673f6dff..ac4fc20d 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -57,6 +57,7 @@ namespace hiptensor typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, + typename ComputeDataType = ADataType, ck::enable_if_t = false> @@ -70,7 +71,8 @@ namespace hiptensor EDataType, AElementwiseOperation, BElementwiseOperation, - CDEElementwiseOperation> + CDEElementwiseOperation, + ComputeDataType> { using BaseArgument = ck::tensor_operation::device::BaseArgument; using BaseInvoker = ck::tensor_operation::device::BaseInvoker; @@ -324,7 +326,8 @@ namespace hiptensor typename AccumDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType> struct MetaTraits> + CDEElementwiseOperation, + ComputeDataType>> : public MetaTraits< ck::tensor_operation::device::DeviceContractionMultipleD> + CDEElementwiseOperation, + ComputeDataType>> { }; @@ -359,7 +364,8 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType = ADataType> auto enumerateReferenceSolutions() { using ReferenceOp = ReferenceContraction_M2_N2_K2; + CDEElementwiseOperation, + ComputeDataType>; auto solution = std::make_unique>( std::make_unique()); diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp index 106dd5ff..146d2721 100644 --- a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -32,6 +32,34 @@ namespace hiptensor ContractionCpuReferenceInstances::ContractionCpuReferenceInstances() { // Register all the solutions exactly once + // Bilinear f16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + _Float16, + _Float16, + ck::Tuple<_Float16>, + _Float16, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + 
float>()); + + // Bilinear bf16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + // Bilinear f32 registerSolutions( enumerateReferenceSolutions<2, @@ -58,6 +86,34 @@ namespace hiptensor ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Bilinear>()); + // Scale f16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + _Float16, + _Float16, + ck::Tuple<>, + _Float16, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + + // Scale bf16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple<>, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + // Scale f32 registerSolutions( enumerateReferenceSolutions<2, diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp index 4fa7acf7..ab158f96 100644 --- a/library/src/contraction/contraction_meta_traits.hpp +++ b/library/src/contraction/contraction_meta_traits.hpp @@ -49,7 +49,8 @@ namespace hiptensor typename DsDataType, typename EDataType, typename AElementwiseOperation, - typename BElementwiseOperation> + typename BElementwiseOperation, + typename ComputeDataType> struct MetaTraits> + ck::tensor_operation::element_wise::Bilinear, + ComputeDataType>> { constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; constexpr static ck::index_t DimsK = NumDimsK; - using ADataT = ADataType; - using BDataT = BDataType; - using DDataT = DsDataType; - using EDataT = EDataType; - using AOp = AElementwiseOperation; - using BOp = BElementwiseOperation; - using CDEOp = ck::tensor_operation::element_wise::Bilinear; + using ADataT + = std::conditional_t, hip_bfloat16, ADataType>; + using BDataT + = std::conditional_t, hip_bfloat16, BDataType>; + using DDataT + = std::conditional_t, hip_bfloat16, DsDataType>; + using EDataT + = std::conditional_t, hip_bfloat16, EDataType>; + using AOp = AElementwiseOperation; + using BOp = BElementwiseOperation; + using CDEOp = ck::tensor_operation::element_wise::Bilinear; }; // Partial specialize for Scale contraction @@ -82,7 +88,8 @@ namespace hiptensor typename BDataType, typename EDataType, typename AElementwiseOperation, - typename BElementwiseOperation> + typename BElementwiseOperation, + typename ComputeDataType> struct MetaTraits> + ck::tensor_operation::element_wise::Scale, + ComputeDataType>> { constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; constexpr static ck::index_t DimsK = NumDimsK; - using ADataT = ADataType; - using BDataT = BDataType; - using DDataT = NoneType; - using EDataT = EDataType; - using AOp = AElementwiseOperation; - using BOp = BElementwiseOperation; - using CDEOp = ck::tensor_operation::element_wise::Scale; + using ADataT + = std::conditional_t, hip_bfloat16, ADataType>; + using BDataT + = std::conditional_t, hip_bfloat16, BDataType>; + using DDataT = NoneType; + using EDataT + = std::conditional_t, hip_bfloat16, EDataType>; + using AOp = AElementwiseOperation; + using BOp = BElementwiseOperation; + 
using CDEOp = ck::tensor_operation::element_wise::Scale; }; } // namespace hiptensor diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index aaa624f6..1b2cf92e 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -150,6 +150,192 @@ namespace hiptensor } } + // test + template <> + struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, ContractionOpId_t::SCALE> + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // TODO select unique_id + unique_id = 7255639152084218514; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, ContractionOpId_t::BILINEAR> + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // select unique_id + unique_id = 7255639152084218514; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // TODO select unique_id + unique_id = 
8689089455041651212; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // select unique_id + unique_id = 8689089455041651212; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + // end test + template <> struct ActorCriticSelection { @@ -1418,7 +1604,99 @@ namespace hiptensor std::vector const& e_ms_ns_strides, const uint64_t workspaceSize) { - if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F) + if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == NONE_TYPE && typeE == HIP_R_16F) + { + return ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::SCALE>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == HIP_R_16F + && typeE == HIP_R_16F) + { + return ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::BILINEAR>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == NONE_TYPE + && typeE == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == HIP_R_16BF + && typeE == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE + && typeE == HIP_R_32F) { return ActorCriticSelection:: selectWinner(winner, diff --git a/library/src/contraction/contraction_solution.hpp b/library/src/contraction/contraction_solution.hpp index 0037584e..e76bb351 100644 --- 
a/library/src/contraction/contraction_solution.hpp +++ b/library/src/contraction/contraction_solution.hpp @@ -147,7 +147,8 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType> std::vector> enumerateContractionSolutions(); } // namespace hiptensor diff --git a/library/src/contraction/contraction_solution_impl.hpp b/library/src/contraction/contraction_solution_impl.hpp index 0fb5df9d..5e191441 100644 --- a/library/src/contraction/contraction_solution_impl.hpp +++ b/library/src/contraction/contraction_solution_impl.hpp @@ -274,7 +274,8 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType = ADataType> std::vector> enumerateContractionSolutions() { using ContractionOp @@ -287,7 +288,8 @@ namespace hiptensor EDataType, AElementwiseOperation, BElementwiseOperation, - CDEElementwiseOperation>; + CDEElementwiseOperation, + ComputeDataType>; using Factory = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory; diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index fd263a8b..6d481577 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -32,6 +32,35 @@ namespace hiptensor ContractionSolutionInstances::ContractionSolutionInstances() { // Register all the solutions exactly once + + // Bilinear bf16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + + // Bilinear f16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::half_t, + ck::half_t, + ck::Tuple, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + // Bilinear f32 registerSolutions( enumerateContractionSolutions<2, @@ -58,6 +87,34 @@ namespace hiptensor ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Bilinear>()); + // Scale bf16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple<>, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + + // Scale f16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::half_t, + ck::half_t, + ck::Tuple<>, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + // Scale f32 registerSolutions( enumerateContractionSolutions<2, diff --git a/library/src/contraction/device/CMakeLists.txt b/library/src/contraction/device/CMakeLists.txt index f2e4a0fb..b9b382c0 100644 --- a/library/src/contraction/device/CMakeLists.txt +++ b/library/src/contraction/device/CMakeLists.txt @@ -24,24 +24,40 @@ # ############################################################################### -set(CK_CONTRACTION_INSTANCE_SOURCES - 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp -) + set(CK_CONTRACTION_INSTANCE_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp + ) -add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) -target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) + add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) + target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..7d777a83 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..a9a97148 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..d83d8d16 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..bc49c82b --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..a9d963ab --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..c139942e --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..3c6ced30 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..33c66296 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..05400151 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..bba95b14 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..fb5ecec0 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..1dd6613c --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..e98aee20 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..db8de1c0 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..397ef327 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..1f9221dc --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
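[Aside, not part of the patch: every one of these translation units has the same shape — a using-alias naming one heavily templated instance type, plus a single add_* hook that appends concrete operations to a caller-owned vector, so the registry/selection layer links in exactly the layout-and-type combinations that were compiled. A generic, self-contained sketch of that idiom follows; DeviceOp, ScaleKKNInstance and addScaleKKNInstances are invented stand-ins for CK's DeviceContractionMultipleD base and add_device_operation_instances helper.]

#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Stand-in for CK's polymorphic device-op base class.
struct DeviceOp
{
    virtual ~DeviceOp() = default;
    virtual std::string name() const = 0;
};

// Stand-in for one templated instance defined in a file like the ones above.
struct ScaleKKNInstance : DeviceOp
{
    std::string name() const override { return "scale_f16_kkn"; }
};

// One hook per translation unit: append this file's instances to the registry.
void addScaleKKNInstances(std::vector<std::unique_ptr<DeviceOp>>& instances)
{
    instances.push_back(std::make_unique<ScaleKKNInstance>());
}

int main()
{
    std::vector<std::unique_ptr<DeviceOp>> registry;
    addScaleKKNInstances(registry); // the selection layer later picks from this registry
    for(const auto& op : registry)
    {
        std::cout << op->name() << "\n";
    }
    return 0;
}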
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/contraction_bilinear.hpp b/library/src/contraction/device/hiptensor_contraction_bilinear.hpp similarity index 100% rename from library/src/contraction/device/contraction_bilinear.hpp rename to library/src/contraction/device/hiptensor_contraction_bilinear.hpp diff --git a/library/src/contraction/device/contraction_scale.hpp b/library/src/contraction/device/hiptensor_contraction_scale.hpp similarity index 100% rename from library/src/contraction/device/contraction_scale.hpp rename to library/src/contraction/device/hiptensor_contraction_scale.hpp diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index 09f5ddf6..b96a204e 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -708,17 +708,6 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, return errorCode; } - if(plan->mContractionDesc.mComputeType != plan->mContractionDesc.mTensorDesc[3].mType) - { - auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE; - snprintf(msg, - sizeof(msg), - "Internal Error : compute type != D type (%s)", - hiptensorGetErrorString(errorCode)); - logger->logError("hiptensorContraction", msg); - return errorCode; - } - auto* cSolution = (hiptensor::ContractionSolution*)(plan->mSolution); auto canRun = cSolution->initArgs(alpha, diff --git a/library/src/hiptensor.cpp b/library/src/hiptensor.cpp index 9740d2a8..51af1f48 100644 --- a/library/src/hiptensor.cpp +++ b/library/src/hiptensor.cpp @@ -152,7 +152,8 @@ hiptensorStatus_t hiptensorInitTensorDescriptor(const hiptensorHandle_t* han } if((lens == nullptr) - || ((dataType != HIP_R_16F) && (dataType != HIP_R_32F) && (dataType != HIP_R_64F)) + || ((dataType != HIP_R_16F) && (dataType != HIP_R_16BF) && (dataType != HIP_R_32F) + && (dataType != HIP_R_64F)) || unaryOp != HIPTENSOR_OP_IDENTITY) { auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE; diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index ada3ce61..15972d60 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -26,15 +26,30 @@ # Check whether building within hiptensor context if( 
CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) + add_hiptensor_sample(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) + add_hiptensor_sample(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) add_hiptensor_sample(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) + add_hiptensor_sample(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) add_hiptensor_sample(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) # If building hipTensor samples as a standalone Cmake project else() + add_executable(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) + target_link_libraries(simple_contraction_scale_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) + target_link_libraries(simple_contraction_scale_bf16 PRIVATE hiptensor::hiptensor) add_executable(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) target_link_libraries(simple_contraction_scale_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) + target_link_libraries(simple_contraction_bilinear_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) + target_link_libraries(simple_contraction_bilinear_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) target_link_libraries(simple_contraction_bilinear_f32 PRIVATE hiptensor::hiptensor) diff --git a/samples/01_contraction/simple_bilinear_contraction_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_bf16.cpp new file mode 100644 index 00000000..0a4a9314 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_bf16.cpp @@ -0,0 +1,342 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
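[Reading aid, not part of the patch: with the contraction indices written out explicitly, the operation performed by the bf16 bilinear sample introduced above (and equally by its f16 and f32 siblings), using the sample's own mode ordering modeA = {m,n,h,k}, modeB = {u,v,h,k}, modeC = {m,n,u,v}, is:]

C_{m,n,u,v} \;\leftarrow\; \alpha \sum_{h=1}^{3} \sum_{k=1}^{4} A_{m,n,h,k}\, B_{u,v,h,k} \;+\; \beta\, C_{m,n,u,v}

[With the sample's extents m=5, n=6, u=3, v=4, h=3, k=4, the output has 5*6*3*4 = 360 elements, each summing 3*4 = 12 (h,k) pairs; alpha = 1.1 and beta = 1.0 blend the fresh contraction with the randomly initialized C.]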
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef hip_bfloat16 ADataType; + typedef hip_bfloat16 BDataType; + typedef hip_bfloat16 CDataType; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_16BF; + hipDataType typeB = HIP_R_16BF; + hipDataType typeC = HIP_R_16BF; + hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.1f; + floatTypeCompute beta = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * + *C_{m,n,u,v} + **********************/ + + std::vector modeC{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeC = modeC.size(); + + std::unordered_map extent; + + extent['m'] = 5; + extent['n'] = 6; + extent['u'] = 3; + extent['v'] = 4; + extent['h'] = 3; + extent['k'] = 4; + + std::vector c_ms_ns_lengths; + for(auto mode : modeC) + { + c_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t c_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &c_ms_ns, + nmodeC, + c_ms_ns_lengths.data(), + NULL, /*stride*/ + typeC, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsC = std::accumulate( + c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeC = sizeof(CDataType) * elementsC; + + ADataType* A = nullptr; + BDataType* B = nullptr; + CDataType* C = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); + + void *A_d, *B_d, *C_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); + + /******************* + * Initialize data + *******************/ + for(int64_t i = 0; i < elementsA; i++) + { + A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsB; i++) + { + B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsC; i++) + { + C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." << std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementC; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "c_ms_ns: " << c_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + 
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." << std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + (void*)&beta, + C_d, + C_d, + workspace, + worksize, + 0 /* stream */)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsC < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor C elements:\n"; + hiptensorPrintArrayElements(std::cout, C, elementsC); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorC; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorC.open("tensor_C_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); + tensorC.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(C); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(C_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" << std::endl; + + return 0; +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f16.cpp new file mode 100644 index 00000000..d9d044c9 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f16.cpp @@ -0,0 +1,342 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
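[A detail shared by the bf16 sample above and the f16 sample whose source begins here: alpha and beta are passed to hiptensorContraction as void* and are interpreted according to the descriptor's compute type (HIPTENSOR_COMPUTE_32F), not according to the 16-bit element type — patch 02 of this series notes a unit-test bug caused by getting exactly this wrong. A minimal sketch of the rule, with the surrounding hipTensor calls elided into comments; it is illustrative, not the library's documentation.]

#include <iostream>

int main()
{
    // Sketch only: scalar arguments must match the *compute* type.
    // With typeCompute == HIPTENSOR_COMPUTE_32F the scalars are 32-bit
    // floats, even though A/B/C hold _Float16 or hip_bfloat16 elements.
    float alpha = 1.1f;
    float beta  = 1.0f;
    // hiptensorContraction(handle, &plan, (void*)&alpha, A_d, B_d,
    //                      (void*)&beta, C_d, C_d, workspace, worksize,
    //                      0 /* stream */);
    // A _Float16 alpha handed through that void* would be read as the low
    // bytes of a float and silently scale the result by garbage.
    std::cout << "alpha=" << alpha << " beta=" << beta << "\n";
    return 0;
}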
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef _Float16 ADataType; + typedef _Float16 BDataType; + typedef _Float16 CDataType; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_16F; + hipDataType typeB = HIP_R_16F; + hipDataType typeC = HIP_R_16F; + hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.1f; + floatTypeCompute beta = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * + *C_{m,n,u,v} + **********************/ + + std::vector modeC{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeC = modeC.size(); + + std::unordered_map extent; + + extent['m'] = 5; + extent['n'] = 6; + extent['u'] = 3; + extent['v'] = 4; + extent['h'] = 3; + extent['k'] = 4; + + std::vector c_ms_ns_lengths; + for(auto mode : modeC) + { + c_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t c_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &c_ms_ns, + nmodeC, + c_ms_ns_lengths.data(), + NULL, /*stride*/ + typeC, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsC = std::accumulate( + c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeC = sizeof(CDataType) * elementsC; + + ADataType* A = nullptr; + BDataType* B = nullptr; + CDataType* C = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); + + void *A_d, *B_d, *C_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); + + /******************* + * Initialize data + *******************/ + for(int64_t i = 0; i < elementsA; i++) + { + A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsB; i++) + { + B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsC; i++) + { + C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." << std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementC; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "c_ms_ns: " << c_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + 
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." << std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + (void*)&beta, + C_d, + C_d, + workspace, + worksize, + 0 /* stream */)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsC < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor C elements:\n"; + hiptensorPrintArrayElements(std::cout, C, elementsC); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorC; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorC.open("tensor_C_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); + tensorC.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(C); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(C_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" << std::endl; + + return 0; +} diff --git a/samples/01_contraction/simple_scale_contraction_bf16.cpp b/samples/01_contraction/simple_scale_contraction_bf16.cpp new file mode 100644 index 00000000..e05916bf --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_bf16.cpp @@ -0,0 +1,334 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
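[Reading aid for the two scale samples that follow (bf16 here, f16 after it), not part of the patch: they drop the C input entirely — the descriptor's C slot is passed as nullptr/nullptr/0 and hiptensorContraction receives nullptr for beta and for the C pointer. One caveat when reading them: the in-code banner comment writes B_{h,k,u,v} and calls the output C, but modeB is {'u','v','h','k'} and the output tensor in the code is D, so the operation actually performed is:]

D_{m,n,u,v} \;=\; \alpha \sum_{h=1}^{3} \sum_{k=1}^{4} A_{m,n,h,k}\, B_{u,v,h,k}

[With the shared extents, D again has 5*6*3*4 = 360 elements, each contracting 12 (h,k) pairs.]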
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + typedef hip_bfloat16 ADataType; + typedef hip_bfloat16 BDataType; + typedef hip_bfloat16 DDataType; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_16BF; + hipDataType typeB = HIP_R_16BF; + hipDataType typeD = HIP_R_16BF; + hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} + **********************/ + + std::vector modeD{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeD = modeD.size(); + + std::unordered_map extent; + + extent['m'] = 5; + extent['n'] = 6; + extent['u'] = 3; + extent['v'] = 4; + extent['h'] = 3; + extent['k'] = 4; + + std::vector d_ms_ns_lengths; + for(auto mode : modeD) + { + d_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t d_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &d_ms_ns, + nmodeD, + d_ms_ns_lengths.data(), + NULL, /*stride*/ + typeD, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsD = std::accumulate( + d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeD = sizeof(DDataType) * elementsD; + + ADataType* A = nullptr; + BDataType* B = nullptr; + DDataType* D = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); + + void *A_d, *B_d, *D_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); + + /******************* + * Initialize data + *******************/ + for(int64_t i = 0; i < elementsA; i++) + { + A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsB; i++) + { + B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsD; i++) + { + D[i] = std::numeric_limits::signaling_NaN(); + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." << std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementD; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "d_ms_ns: " << d_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + nullptr, + nullptr, + 0, + &d_ms_ns, + modeD.data(), + alignmentRequirementD, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * 
Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." << std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + nullptr, + nullptr, + D_d, + workspace, + worksize, + 0 /* stream */)); + + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsD < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor D elements:\n"; + hiptensorPrintArrayElements(std::cout, D, elementsD); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorD; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorD.open("tensor_D_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); + tensorD.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(D); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(D_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" << std::endl; + + return 0; +} diff --git a/samples/01_contraction/simple_scale_contraction_f16.cpp b/samples/01_contraction/simple_scale_contraction_f16.cpp new file mode 100644 index 00000000..1e62be85 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f16.cpp @@ -0,0 +1,334 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + typedef _Float16 ADataType; + typedef _Float16 BDataType; + typedef _Float16 DDataType; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_16F; + hipDataType typeB = HIP_R_16F; + hipDataType typeD = HIP_R_16F; + hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} + **********************/ + + std::vector modeD{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeD = modeD.size(); + + std::unordered_map extent; + + extent['m'] = 5; + extent['n'] = 6; + extent['u'] = 3; + extent['v'] = 4; + extent['h'] = 3; + extent['k'] = 4; + + std::vector d_ms_ns_lengths; + for(auto mode : modeD) + { + d_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t d_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &d_ms_ns, + nmodeD, + d_ms_ns_lengths.data(), + NULL, /*stride*/ + typeD, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsD = std::accumulate( + d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeD = sizeof(DDataType) * elementsD; + + ADataType* A = nullptr; + BDataType* B = nullptr; + DDataType* D = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); + + void *A_d, *B_d, *D_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); + + /******************* + * Initialize data + *******************/ + for(int64_t i = 0; i < elementsA; i++) + { + A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsB; i++) + { + B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsD; i++) + { + D[i] = std::numeric_limits::signaling_NaN(); + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." << std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementD; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "d_ms_ns: " << d_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + nullptr, + nullptr, + 0, + &d_ms_ns, + modeD.data(), + alignmentRequirementD, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * 
Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." << std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + nullptr, + nullptr, + D_d, + workspace, + worksize, + 0 /* stream */)); + + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsD < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor D elements:\n"; + hiptensorPrintArrayElements(std::cout, D, elementsD); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorD; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorD.open("tensor_D_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); + tensorD.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(D); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(D_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" 
<< std::endl; + + return 0; +} diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index 2bd90e90..a08065a0 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -1,6 +1,8 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: + - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F] - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F] Algorithm Types: diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index 329f1b84..b28e9a88 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -1,6 +1,8 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: + - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] Algorithm Types: diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index 5d745d12..9446157f 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -56,7 +56,8 @@ namespace hiptensor // False = skip test bool ContractionTest::checkDevice(hipDataType datatype) const { - return (isF32Supported() && datatype == HIP_R_32F) + return (isF32Supported() + && (datatype == HIP_R_32F || datatype == HIP_R_16F || datatype == HIP_R_16BF)) || (isF64Supported() && datatype == HIP_R_64F); } @@ -115,11 +116,15 @@ namespace hiptensor auto CDataType = testType[2]; auto DDataType = testType[3]; - EXPECT_TRUE((ADataType == HIP_R_32F) || (ADataType == HIP_R_64F)); - EXPECT_TRUE((BDataType == HIP_R_32F) || (BDataType == HIP_R_64F)); - EXPECT_TRUE((CDataType == HIP_R_32F) || (CDataType == HIP_R_64F) + EXPECT_TRUE((ADataType == HIP_R_16F) || (ADataType == HIP_R_16BF) + || (ADataType == HIP_R_32F) || (ADataType == HIP_R_64F)); + EXPECT_TRUE((BDataType == HIP_R_16F) || (BDataType == HIP_R_16BF) + || (BDataType == HIP_R_32F) || (BDataType == HIP_R_64F)); + EXPECT_TRUE((CDataType == HIP_R_16F) || (CDataType == HIP_R_16BF) + || (CDataType == HIP_R_32F) || (CDataType == HIP_R_64F) || (CDataType == NONE_TYPE)); - EXPECT_TRUE((DDataType == HIP_R_32F) || (DDataType == HIP_R_64F)); + EXPECT_TRUE((DDataType == HIP_R_16F) || (DDataType == HIP_R_16BF) + || (DDataType == HIP_R_32F) || (DDataType == HIP_R_64F)); mRunFlag &= checkDevice(DDataType); @@ -228,7 +233,35 @@ namespace hiptensor auto resource = getResource(); resource->resizeStorage(lengths, elementBytes); - if(ADataType == HIP_R_32F && BDataType == HIP_R_32F && DDataType == HIP_R_32F) + if(ADataType == HIP_R_16F && BDataType == HIP_R_16F && DDataType == HIP_R_16F) + { + // Initialize matrix data on device + fillLaunchKernel<_Float16>((_Float16*)resource->deviceA().get(), elementsA); + fillLaunchKernel<_Float16>((_Float16*)resource->deviceB().get(), elementsB); + if(CDataType == HIP_R_16F) + { + fillLaunchKernel<_Float16>((_Float16*)resource->deviceC().get(), elementsCD); + } + fillValLaunchKernel<_Float16>((_Float16*)resource->deviceD().get(), + elementsCD, + 
std::numeric_limits<_Float16>::signaling_NaN()); + } + else if(ADataType == HIP_R_16BF && BDataType == HIP_R_16BF && DDataType == HIP_R_16BF) + { + // Initialize matrix data on device + fillLaunchKernel((hip_bfloat16*)resource->deviceA().get(), elementsA); + fillLaunchKernel((hip_bfloat16*)resource->deviceB().get(), elementsB); + if(CDataType == HIP_R_16BF) + { + fillLaunchKernel((hip_bfloat16*)resource->deviceC().get(), + elementsCD); + } + fillValLaunchKernel( + (hip_bfloat16*)resource->deviceD().get(), + elementsCD, + std::numeric_limits::signaling_NaN()); + } + else if(ADataType == HIP_R_32F && BDataType == HIP_R_32F && DDataType == HIP_R_32F) { // Initialize matrix data on device fillLaunchKernel((float*)resource->deviceA().get(), elementsA); @@ -328,7 +361,7 @@ namespace hiptensor { auto resource = getResource(); - int size = ((DDataType == HIP_R_32F) ? sizeof(float) : sizeof(double)); + int size = hipDataTypeSize(DDataType); size_t elementsA = std::accumulate(a_ms_ks.mLengths.begin(), a_ms_ks.mLengths.end(), @@ -346,7 +379,50 @@ namespace hiptensor auto D = resource->allocHost(elementsCD * size); resource->copyData(D, resource->deviceD(), elementsCD * size); - if(DDataType == HIP_R_32F) + if(DDataType == HIP_R_16F) + { + stream << "Tensor A elements:\n"; + hiptensorPrintArrayElements<_Float16>( + stream, (_Float16*)resource->hostA().get(), elementsA); + stream << std::endl; + + stream << "Tensor B elements:\n"; + hiptensorPrintArrayElements<_Float16>( + stream, (_Float16*)resource->hostB().get(), elementsB); + stream << std::endl; + + stream << "Tensor C elements:\n"; + hiptensorPrintArrayElements<_Float16>( + stream, (_Float16*)resource->hostC().get(), elementsCD); + stream << std::endl; + + stream << "Tensor D elements:\n"; + hiptensorPrintArrayElements<_Float16>(stream, (_Float16*)D.get(), elementsCD); + stream << std::endl; + } + else if(DDataType == HIP_R_16BF) + { + stream << "Tensor A elements:\n"; + hiptensorPrintArrayElements( + stream, (hip_bfloat16*)resource->hostA().get(), elementsA); + stream << std::endl; + + stream << "Tensor B elements:\n"; + hiptensorPrintArrayElements( + stream, (hip_bfloat16*)resource->hostB().get(), elementsB); + stream << std::endl; + + stream << "Tensor C elements:\n"; + hiptensorPrintArrayElements( + stream, (hip_bfloat16*)resource->hostC().get(), elementsCD); + stream << std::endl; + + stream << "Tensor D elements:\n"; + hiptensorPrintArrayElements( + stream, (hip_bfloat16*)D.get(), elementsCD); + stream << std::endl; + } + else if(DDataType == HIP_R_32F) { stream << "Tensor A elements:\n"; hiptensorPrintArrayElements( @@ -456,11 +532,24 @@ namespace hiptensor size_t{1}, std::multiplies()); - int sizeD = elementsCD * ((DDataType == HIP_R_32F) ? 
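// [Aside, not patch content. Together with the signaling-NaN fills above,
// which poison D so any element the kernel fails to write trips validation,
// the test hunks here retire the hard-coded
//   (DDataType == HIP_R_32F) ? sizeof(float) : sizeof(double)
// size computation (the removed line this note interrupts) in favour of
// hipDataTypeSize(DDataType), which scales to the new 16-bit types. Below is
// a self-contained sketch of what such a helper must now cover;
// dataTypeSizeSketch is illustrative — per patch 02's diffstat, the real
// helper lives in library/src/data_types.cpp — and it assumes the HIP
// headers are available.]

#include <cstddef>
#include <hip/library_types.h> // hipDataType, HIP_R_16F, ...

// Illustrative stand-in for hipTensor's hipDataTypeSize().
std::size_t dataTypeSizeSketch(hipDataType t)
{
    switch(t)
    {
    case HIP_R_16F:  return 2; // _Float16
    case HIP_R_16BF: return 2; // hip_bfloat16
    case HIP_R_32F:  return 4; // float
    case HIP_R_64F:  return 8; // double
    default:         return 0; // e.g. hiptensor's NONE_TYPE: caller must reject
    }
}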
sizeof(float) : sizeof(double)); + int sizeD = elementsCD * hipDataTypeSize(DDataType); auto reference = resource->allocDevice(sizeD); resource->copyData(reference, resource->hostD(), sizeD); - if(DDataType == HIP_R_32F) + if(DDataType == HIP_R_16F) + { + std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<_Float16>( + (_Float16*)resource->deviceD().get(), (_Float16*)reference.get(), elementsCD); + } + else if(DDataType == HIP_R_16BF) + { + std::tie(mValidationResult, mMaxRelativeError) + = compareEqualLaunchKernel( + (hip_bfloat16*)resource->deviceD().get(), + (hip_bfloat16*)reference.get(), + elementsCD); + } + else if(DDataType == HIP_R_32F) { std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel( (float*)resource->deviceD().get(), (float*)reference.get(), elementsCD); diff --git a/test/device/common.hpp b/test/device/common.hpp index f961abc1..172e6953 100644 --- a/test/device/common.hpp +++ b/test/device/common.hpp @@ -72,7 +72,7 @@ __global__ void fillKernel(DataType* data, uint32_t elementSize, uint32_t seed) if(index < elementSize) { - auto value = (DataType(index / DataType(RAND_MAX) - 0.5) * 100) / elementSize; + auto value = (DataType(index / double(RAND_MAX) - 0.5) * 100) / elementSize; data[index] = static_cast(value); } } diff --git a/test/llvm/yaml_parser_config.cpp b/test/llvm/yaml_parser_config.cpp index 46f4c43e..cd3eb46f 100644 --- a/test/llvm/yaml_parser_config.cpp +++ b/test/llvm/yaml_parser_config.cpp @@ -110,6 +110,7 @@ namespace llvm static void enumeration(IO& io, hipDataType& value) { io.enumCase(value, "HIP_R_16F", HIP_R_16F); + io.enumCase(value, "HIP_R_16BF", HIP_R_16BF); io.enumCase(value, "HIP_R_32F", HIP_R_32F); io.enumCase(value, "HIP_R_64F", HIP_R_64F); io.enumCase(value, "NONE_TYPE", hiptensor::NONE_TYPE); diff --git a/test/utils.hpp b/test/utils.hpp index 1f7ece44..ad4bb565 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -41,9 +41,9 @@ #include #include #include +#include #include "device/common.hpp" -#include "types.hpp" #define HIPTENSOR_FREE_DEVICE(ptr) \ if(ptr != nullptr) \ From 185a2ab115d4e6e8999917e349ca1a4d803e5228 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Sat, 25 Nov 2023 02:38:48 +0000 Subject: [PATCH 02/12] Add support to f32_f16, f32_bf16, f64_f32 to contraction - Support ABCD data type f32 and compute type f16, bf16 - Support ABCD data type f64 and compute type f32 - Fixed bug: alpha, beta were passed in as wrong data type in unit test of contraction - Create sample template of contraction --- .../contraction/contraction_cpu_reference.cpp | 48 +- .../contraction/contraction_cpu_reference.hpp | 39 +- .../contraction_cpu_reference_instances.cpp | 104 ++- .../contraction/contraction_meta_traits.hpp | 18 +- .../src/contraction/contraction_selection.cpp | 737 ++++++++++++++---- .../src/contraction/contraction_selection.hpp | 9 +- .../contraction/contraction_solution_impl.hpp | 9 +- .../contraction_solution_instances.cpp | 88 ++- .../contraction_solution_params.hpp | 9 +- .../contraction_solution_params_impl.hpp | 6 + .../contraction_solution_registry.cpp | 81 +- .../contraction_solution_registry.hpp | 60 +- library/src/contraction/device/CMakeLists.txt | 28 +- ...16_bf16_bf16_compute_f32_kknn_instance.cpp | 27 +- ...16_bf16_bf16_compute_f32_knnn_instance.cpp | 27 +- ...16_bf16_bf16_compute_f32_mknn_instance.cpp | 27 +- ...16_bf16_bf16_compute_f32_mnnn_instance.cpp | 27 +- ..._f16_f16_f16_compute_f32_kknn_instance.cpp | 27 +- ..._f16_f16_f16_compute_f32_knnn_instance.cpp | 27 +- 
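[One small but consequential fix in the test/device/common.hpp hunk above, worth spelling out: fillKernel used to divide by DataType(RAND_MAX). For DataType = _Float16 that conversion overflows to infinity (binary16's largest finite value is 65504), so the quotient is zero for every index and the fill degenerates to a constant; for hip_bfloat16 the 8-bit significand quantizes the quotient coarsely. Dividing by double(RAND_MAX) and converting the final result once avoids both. A small host-side demonstration of the failure mode, assuming a compiler with _Float16 support such as hipcc/clang; it is a sketch, not part of the patch.]

#include <cstdint>
#include <cstdlib>
#include <iostream>

int main()
{
    // binary16 cannot represent RAND_MAX (typically 2147483647); the
    // conversion rounds to +infinity, so "index / DataType(RAND_MAX)"
    // yielded zero for every index when DataType was _Float16.
    _Float16 halfRandMax = static_cast<_Float16>(RAND_MAX);
    std::cout << "RAND_MAX as _Float16: " << static_cast<float>(halfRandMax)
              << "\n"; // inf

    // The fixed kernel normalizes in double first and converts once:
    std::uint32_t index = 12345;
    double        norm  = (index / double(RAND_MAX) - 0.5) * 100;
    _Float16      fill  = static_cast<_Float16>(norm);
    std::cout << "fill value: " << static_cast<float>(fill) << "\n";
    return 0;
}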
..._f16_f16_f16_compute_f32_mknn_instance.cpp | 27 +- ..._f16_f16_f16_compute_f32_mnnn_instance.cpp | 27 +- ...f32_f32_f32_compute_bf16_kknn_instance.cpp | 85 ++ ...f32_f32_f32_compute_bf16_knnn_instance.cpp | 85 ++ ...f32_f32_f32_compute_bf16_mknn_instance.cpp | 85 ++ ...f32_f32_f32_compute_bf16_mnnn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_f16_kknn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_f16_knnn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_f16_mknn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_f16_mnnn_instance.cpp | 85 ++ ..._f64_f64_f64_compute_f32_kknn_instance.cpp | 85 ++ ..._f64_f64_f64_compute_f32_knnn_instance.cpp | 85 ++ ..._f64_f64_f64_compute_f32_mknn_instance.cpp | 85 ++ ..._f64_f64_f64_compute_f32_mnnn_instance.cpp | 85 ++ ...f16_bf16_bf16_compute_f32_kkn_instance.cpp | 27 +- ...f16_bf16_bf16_compute_f32_knn_instance.cpp | 27 +- ...f16_bf16_bf16_compute_f32_mkn_instance.cpp | 27 +- ...f16_bf16_bf16_compute_f32_mnn_instance.cpp | 27 +- ...e_f16_f16_f16_compute_f32_kkn_instance.cpp | 27 +- ...e_f16_f16_f16_compute_f32_knn_instance.cpp | 27 +- ...e_f16_f16_f16_compute_f32_mkn_instance.cpp | 27 +- ...e_f16_f16_f16_compute_f32_mnn_instance.cpp | 27 +- ..._f32_f32_f32_compute_bf16_kkn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_bf16_knn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_bf16_mkn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_bf16_mnn_instance.cpp | 85 ++ ...e_f32_f32_f32_compute_f16_kkn_instance.cpp | 85 ++ ...e_f32_f32_f32_compute_f16_knn_instance.cpp | 85 ++ ...e_f32_f32_f32_compute_f16_mkn_instance.cpp | 85 ++ ...e_f32_f32_f32_compute_f16_mnn_instance.cpp | 85 ++ ...e_f64_f64_f64_compute_f32_kkn_instance.cpp | 62 ++ ...e_f64_f64_f64_compute_f32_knn_instance.cpp | 62 ++ ...e_f64_f64_f64_compute_f32_mkn_instance.cpp | 62 ++ ...e_f64_f64_f64_compute_f32_mnn_instance.cpp | 62 ++ .../src/contraction/hiptensor_contraction.cpp | 59 +- library/src/data_types.cpp | 43 + library/src/include/data_types.hpp | 2 + samples/01_contraction/CMakeLists.txt | 31 + .../simple_bilinear_contraction.hpp | 351 +++++++++ .../simple_bilinear_contraction_bf16.cpp | 313 +------- .../simple_bilinear_contraction_f16.cpp | 313 +------- .../simple_bilinear_contraction_f32.cpp | 313 +------- .../simple_bilinear_contraction_f32_bf16.cpp | 57 ++ .../simple_bilinear_contraction_f32_f16.cpp | 57 ++ .../simple_bilinear_contraction_f64.cpp | 57 ++ .../simple_bilinear_contraction_f64_f32.cpp | 57 ++ .../simple_scale_contraction.hpp | 341 ++++++++ .../simple_scale_contraction_bf16.cpp | 311 +------- .../simple_scale_contraction_f16.cpp | 317 +------- .../simple_scale_contraction_f32.cpp | 310 +------- .../simple_scale_contraction_f32_bf16.cpp | 58 ++ .../simple_scale_contraction_f32_f16.cpp | 58 ++ .../simple_scale_contraction_f64.cpp | 57 ++ .../simple_scale_contraction_f64_f32.cpp | 57 ++ .../configs/bilinear_test_params.yaml | 7 +- .../configs/scale_test_params.yaml | 3 + test/01_contraction/contraction_test.cpp | 18 +- 76 files changed, 4657 insertions(+), 2149 deletions(-) create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp create mode 100644 
library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction.hpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f64.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp create mode 100644 samples/01_contraction/simple_scale_contraction.hpp create mode 100644 samples/01_contraction/simple_scale_contraction_f32_bf16.cpp create mode 100644 
samples/01_contraction/simple_scale_contraction_f32_f16.cpp
 create mode 100644 samples/01_contraction/simple_scale_contraction_f64.cpp
 create mode 100644 samples/01_contraction/simple_scale_contraction_f64_f32.cpp

diff --git a/library/src/contraction/contraction_cpu_reference.cpp b/library/src/contraction/contraction_cpu_reference.cpp
index 13dcdffd..ac1d9711 100644
--- a/library/src/contraction/contraction_cpu_reference.cpp
+++ b/library/src/contraction/contraction_cpu_reference.cpp
@@ -28,31 +28,33 @@
 #include "contraction_cpu_reference_impl.hpp"
 #include "contraction_cpu_reference_instances.hpp"
 
-hiptensorStatus_t hiptensorContractionReference(void const*                     alpha,
-                                                void const*                     A,
-                                                void const*                     B,
-                                                void const*                     beta,
-                                                void const*                     C,
-                                                void*                           D,
-                                                std::vector<std::size_t> const& a_ms_ks_lengths,
-                                                std::vector<std::size_t> const& a_ms_ks_strides,
-                                                std::vector<std::size_t> const& b_ns_ks_lengths,
-                                                std::vector<std::size_t> const& b_ns_ks_strides,
-                                                std::vector<std::size_t> const& c_ms_ns_lengths,
-                                                std::vector<std::size_t> const& c_ms_ns_strides,
-                                                std::vector<std::size_t> const& d_ms_ns_lengths,
-                                                std::vector<std::size_t> const& d_ms_ns_strides,
-                                                hipDataType                     typeA,
-                                                hipDataType                     typeB,
-                                                hipDataType                     typeC,
-                                                hipDataType                     typeD,
-                                                void*                           workspace)
+hiptensorStatus_t hiptensorContractionReference(const hiptensorContractionPlan_t* plan,
+                                                void const*                       alpha,
+                                                void const*                       A,
+                                                void const*                       B,
+                                                void const*                       beta,
+                                                void const*                       C,
+                                                void*                             D,
+                                                std::vector<std::size_t> const&   a_ms_ks_lengths,
+                                                std::vector<std::size_t> const&   a_ms_ks_strides,
+                                                std::vector<std::size_t> const&   b_ns_ks_lengths,
+                                                std::vector<std::size_t> const&   b_ns_ks_strides,
+                                                std::vector<std::size_t> const&   c_ms_ns_lengths,
+                                                std::vector<std::size_t> const&   c_ms_ns_strides,
+                                                std::vector<std::size_t> const&   d_ms_ns_lengths,
+                                                std::vector<std::size_t> const&   d_ms_ns_strides,
+                                                hipDataType                       typeA,
+                                                hipDataType                       typeB,
+                                                hipDataType                       typeC,
+                                                hipDataType                       typeD,
+                                                void*                             workspace)
 {
-    auto& instances = hiptensor::ContractionCpuReferenceInstances::instance();
+    auto& instances   = hiptensor::ContractionCpuReferenceInstances::instance();
+    auto  computeType = plan->mContractionDesc.mComputeType;
 
     auto candidates
-        = (C == nullptr)
-              ? instances->allSolutions().query(typeA, typeB, hiptensor::NONE_TYPE, typeD)
-              : instances->allSolutions().query(typeA, typeB, typeC, typeD);
+        = (C == nullptr) ? instances->allSolutions().query(
              typeA, typeB, hiptensor::NONE_TYPE, typeD, computeType)
+                         : instances->allSolutions().query(typeA, typeB, typeC, typeD, computeType);
 
     auto toCKVec
         = [](auto& inputVec) { return std::vector<ck::index_t>(inputVec.begin(), inputVec.end()); };
 
diff --git a/library/src/contraction/contraction_cpu_reference.hpp b/library/src/contraction/contraction_cpu_reference.hpp
index aadb062e..471026dc 100644
--- a/library/src/contraction/contraction_cpu_reference.hpp
+++ b/library/src/contraction/contraction_cpu_reference.hpp
@@ -32,24 +32,25 @@
 
 #include
 
-hiptensorStatus_t hiptensorContractionReference(void const*                     alpha,
-                                                void const*                     A,
-                                                void const*                     B,
-                                                void const*                     beta,
-                                                void const*                     C,
-                                                void*                           D,
-                                                std::vector<std::size_t> const& a_ms_ks_lengths,
-                                                std::vector<std::size_t> const& a_ms_ks_strides,
-                                                std::vector<std::size_t> const& b_ks_ns_lengths,
-                                                std::vector<std::size_t> const& b_ks_ns_strides,
-                                                std::vector<std::size_t> const& c_ms_ns_lengths,
-                                                std::vector<std::size_t> const& c_ms_ns_strides,
-                                                std::vector<std::size_t> const& d_ms_ns_lengths,
-                                                std::vector<std::size_t> const& d_ms_ns_strides,
-                                                hipDataType                     typeA,
-                                                hipDataType                     typeB,
-                                                hipDataType                     typeC,
-                                                hipDataType                     typeD,
-                                                void*                           workspace);
+hiptensorStatus_t hiptensorContractionReference(const hiptensorContractionPlan_t* plan,
+                                                void const*                       alpha,
+                                                void const*                       A,
+                                                void const*                       B,
+                                                void const*                       beta,
+                                                void const*                       C,
+                                                void*                             D,
+                                                std::vector<std::size_t> const&   a_ms_ks_lengths,
+                                                std::vector<std::size_t> const&   a_ms_ks_strides,
+                                                std::vector<std::size_t> const&   b_ks_ns_lengths,
+                                                std::vector<std::size_t> const&   b_ks_ns_strides,
+                                                std::vector<std::size_t> const&   c_ms_ns_lengths,
+                                                std::vector<std::size_t> const&   c_ms_ns_strides,
+                                                std::vector<std::size_t> const&   d_ms_ns_lengths,
+                                                std::vector<std::size_t> const&   d_ms_ns_strides,
+                                                hipDataType                       typeA,
+                                                hipDataType                       typeB,
+                                                hipDataType                       typeC,
+                                                hipDataType                       typeD,
+                                                void*                             workspace);
 
 #endif // HIPTENSOR_CONTRACTION_CPU_REFERENCE_HPP
diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp
index 146d2721..173a49e9 100644
--- a/library/src/contraction/contraction_cpu_reference_instances.cpp
+++ b/library/src/contraction/contraction_cpu_reference_instances.cpp
@@ -37,10 +37,10 @@ namespace hiptensor
             enumerateReferenceSolutions<2,
                                         2,
                                         2,
-                                        _Float16,
-                                        _Float16,
-                                        ck::Tuple<_Float16>,
-                                        _Float16,
+                                        ck::half_t,
+                                        ck::half_t,
+                                        ck::Tuple<ck::half_t>,
+                                        ck::half_t,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::Bilinear,
@@ -71,7 +71,34 @@ namespace hiptensor
                                         float,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
-                                        ck::tensor_operation::element_wise::Bilinear>());
+                                        ck::tensor_operation::element_wise::Bilinear,
+                                        ck::half_t>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2,
+                                        2,
+                                        2,
+                                        float,
+                                        float,
+                                        ck::Tuple<float>,
+                                        float,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Bilinear,
+                                        ck::bhalf_t>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2,
+                                        2,
+                                        2,
+                                        float,
+                                        float,
+                                        ck::Tuple<float>,
+                                        float,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Bilinear,
+                                        float>());
 
         // Bilinear f64
         registerSolutions(
@@ -84,17 +111,31 @@ namespace hiptensor
                                         double,
                                         double,
                                         ck::Tuple<double>,
                                         double,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
-                                        ck::tensor_operation::element_wise::Bilinear>());
+                                        ck::tensor_operation::element_wise::Bilinear,
+                                        float>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2,
+                                        2,
+                                        2,
+                                        double,
+                                        double,
+                                        ck::Tuple<double>,
+                                        double,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Bilinear,
+                                        double>());
 
         // Scale f16
         registerSolutions(
             enumerateReferenceSolutions<2,
                                         2,
                                         2,
-                                        _Float16,
-                                        _Float16,
+                                        ck::half_t,
+                                        ck::half_t,
                                         ck::Tuple<>,
-                                        _Float16,
+                                        ck::half_t,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::Scale,
@@ -125,7 +166,34 @@ namespace hiptensor
                                         float,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
-                                        ck::tensor_operation::element_wise::Scale>());
+                                        ck::tensor_operation::element_wise::Scale,
+                                        ck::half_t>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2,
+                                        2,
+                                        2,
+                                        float,
+                                        float,
+                                        ck::Tuple<>,
+                                        float,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Scale,
+                                        ck::bhalf_t>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2,
+                                        2,
+                                        2,
+                                        float,
+                                        float,
+                                        ck::Tuple<>,
+                                        float,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Scale,
+                                        float>());
 
         // Scale f64
         registerSolutions(
@@ -138,6 +206,20 @@ namespace hiptensor
                                         double,
                                         double,
                                         ck::Tuple<>,
                                         double,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
-                                        ck::tensor_operation::element_wise::Scale>());
+                                        ck::tensor_operation::element_wise::Scale,
+                                        float>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2,
+                                        2,
+                                        2,
+                                        double,
+                                        double,
+                                        ck::Tuple<>,
+                                        double,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Scale,
+                                        double>());
     }
 } // namespace hiptensor
diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp
index ab158f96..6a7cb35f 100644
--- a/library/src/contraction/contraction_meta_traits.hpp
+++ b/library/src/contraction/contraction_meta_traits.hpp
@@ -75,9 +75,12 @@ namespace hiptensor
             = std::conditional_t<std::is_same_v<DsDataType, ck::bhalf_t>, hip_bfloat16, DsDataType>;
         using EDataT
             = std::conditional_t<std::is_same_v<EDataType, ck::bhalf_t>, hip_bfloat16, EDataType>;
-        using AOp   = AElementwiseOperation;
-        using BOp   = BElementwiseOperation;
-        using CDEOp = ck::tensor_operation::element_wise::Bilinear;
+        using ComputeDataT = std::conditional_t<std::is_same_v<ComputeDataType, ck::bhalf_t>,
+                                                hip_bfloat16,
+                                                ComputeDataType>;
+        using AOp          = AElementwiseOperation;
+        using BOp          = BElementwiseOperation;
+        using CDEOp        = ck::tensor_operation::element_wise::Bilinear;
     };
 
     // Partial specialize for Scale contraction
@@ -113,9 +116,12 @@ namespace hiptensor
         using DDataT = NoneType;
         using EDataT
             = std::conditional_t<std::is_same_v<EDataType, ck::bhalf_t>, hip_bfloat16, EDataType>;
-        using AOp   = AElementwiseOperation;
-        using BOp   = BElementwiseOperation;
-        using CDEOp = ck::tensor_operation::element_wise::Scale;
+        using ComputeDataT = std::conditional_t<std::is_same_v<ComputeDataType, ck::bhalf_t>,
+                                                hip_bfloat16,
+                                                ComputeDataType>;
+        using AOp          = AElementwiseOperation;
+        using BOp          = BElementwiseOperation;
+        using CDEOp        = ck::tensor_operation::element_wise::Scale;
     };
 } // namespace hiptensor
diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp
index 1b2cf92e..888ef4c1 100644
--- a/library/src/contraction/contraction_selection.cpp
+++ b/library/src/contraction/contraction_selection.cpp
@@ -54,6 +54,7 @@ namespace hiptensor
                                    hipDataType                     typeE,
                                    std::vector<std::size_t> const& e_ms_ns_lengths,
                                    std::vector<std::size_t> const& e_ms_ns_strides,
+                                   hiptensorComputeType_t          computeType,
                                    const uint64_t                  workspaceSize)
     {
         // Make sure that we calculate full element space in case strides are not packed.
@@ -70,9 +71,11 @@ namespace hiptensor
         auto sizeE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides)
                      * hipDataTypeSize(typeE);
 
-        void *A_d, *B_d, *D_d, *E_d, *wspace;
-        float alpha = 1.02f;
-        float beta  = 1.03f;
+        void * A_d, *B_d, *D_d, *E_d, *wspace;
+        double alpha = 0.0;
+        double beta  = 0.0;
+        writeVal(&alpha, computeType, 1.02);
+        writeVal(&beta, computeType, 1.03);
 
         CHECK_HIP_ALLOC(hipMalloc(&A_d, sizeA));
         CHECK_HIP_ALLOC(hipMalloc(&B_d, sizeB));
@@ -150,9 +153,13 @@ namespace hiptensor
         }
     }
 
-    // test
     template <>
-    struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, ContractionOpId_t::SCALE>
+    struct ActorCriticSelection<_Float16,
+                                _Float16,
+                                _Float16,
+                                _Float16,
+                                ContractionOpId_t::SCALE,
+                                float>
     {
         static hiptensorStatus_t
             selectWinner(ContractionSolution** winner,
@@ -196,7 +203,12 @@ namespace hiptensor
     };
 
     template <>
-    struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, ContractionOpId_t::BILINEAR>
+    struct ActorCriticSelection<_Float16,
+                                _Float16,
+                                _Float16,
+                                _Float16,
+                                ContractionOpId_t::BILINEAR,
+                                float>
     {
         static hiptensorStatus_t
             selectWinner(ContractionSolution** winner,
@@ -224,7 +236,7 @@ namespace hiptensor
 
             size_t unique_id = 0;
 
-            // select unique_id
+            // TODO select unique_id
             unique_id = 7255639152084218514;
 
             if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
@@ -244,7 +256,8 @@ namespace hiptensor
                                 hip_bfloat16,
                                 hip_bfloat16,
                                 hip_bfloat16,
-                                ContractionOpId_t::SCALE>
+                                ContractionOpId_t::SCALE,
+                                float>
     {
         static hiptensorStatus_t
             selectWinner(ContractionSolution** winner,
@@ -292,7 +305,8 @@ namespace hiptensor
                                 hip_bfloat16,
                                 hip_bfloat16,
                                 hip_bfloat16,
-                                ContractionOpId_t::BILINEAR>
+                                ContractionOpId_t::BILINEAR,
+                                float>
     {
         static hiptensorStatus_t
            selectWinner(ContractionSolution** winner,
@@ -320,7 +334,7 @@ namespace hiptensor
 
            size_t unique_id = 0;
 
-           // select unique_id
+           // TODO select unique_id
           unique_id = 8689089455041651212;
 
           if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
@@ -334,10 +348,183 @@ namespace hiptensor
           }
        }
    };
-   // end test
 
    template <>
-   struct ActorCriticSelection<float, float, float, float, ContractionOpId_t::SCALE>
+   struct ActorCriticSelection<float, float, float, float, ContractionOpId_t::SCALE, _Float16>
+   {
+       static hiptensorStatus_t
+           selectWinner(ContractionSolution** winner,
+                        std::unordered_map<size_t, ContractionSolution*> const& candidates,
+                        hipDataType                     typeA,
+                        std::vector<std::size_t> const& a_ms_ks_lengths,
+                        std::vector<std::size_t> const& a_ms_ks_strides,
+                        hipDataType                     typeB,
+                        std::vector<std::size_t> const& b_ns_ks_lengths,
+                        std::vector<std::size_t> const& b_ns_ks_strides,
+                        hipDataType                     typeD,
+                        std::vector<std::size_t> const& d_ms_ns_lengths,
+                        std::vector<std::size_t> const& d_ms_ns_strides,
+                        hipDataType                     typeE,
+                        std::vector<std::size_t> const& e_ms_ns_lengths,
+                        std::vector<std::size_t> const& e_ms_ns_strides,
+                        const uint64_t                  workspaceSize)
+       {
+           int d1 = a_ms_ks_lengths[0];
+           int d2 = a_ms_ks_lengths[1];
+           int d3 = b_ns_ks_lengths[0];
+           int d4 = b_ns_ks_lengths[1];
+           int d5 = a_ms_ks_lengths[2];
+           int d6 = a_ms_ks_lengths[3];
+
+           size_t unique_id = 0;
+
+           // TODO select unique_id
+
+           if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
+           {
+               *winner = candidate->second;
+               return HIPTENSOR_STATUS_SUCCESS;
+           }
+           else
+           {
+               return HIPTENSOR_STATUS_EXECUTION_FAILED;
+           }
+       }
+   };
+
+   template <>
+   struct ActorCriticSelection<float, float, float, float, ContractionOpId_t::BILINEAR, _Float16>
+   {
+       static hiptensorStatus_t
+           selectWinner(ContractionSolution** winner,
+ std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -702,7 +889,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1060,7 +1247,92 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& 
candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1335,7 +1607,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1602,181 +1874,344 @@ namespace hiptensor hipDataType typeE, std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, const uint64_t workspaceSize) { - if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == NONE_TYPE && typeE == HIP_R_16F) + if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == NONE_TYPE && typeE == HIP_R_16F + && computeType == HIP_R_32F) { return ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, - ContractionOpId_t::SCALE>::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::SCALE, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == HIP_R_16F - && typeE == HIP_R_16F) + else if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == HIP_R_16F && typeE == HIP_R_16F + && computeType == HIP_R_32F) { return 
ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, - ContractionOpId_t::BILINEAR>::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::BILINEAR, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == NONE_TYPE - && typeE == HIP_R_16BF) + && typeE == HIP_R_16BF && computeType == HIP_R_32F) { return ActorCriticSelection::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::SCALE, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == HIP_R_16BF - && typeE == HIP_R_16BF) + && typeE == HIP_R_16BF && computeType == HIP_R_32F) { return ActorCriticSelection::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::BILINEAR, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIP_R_16F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIP_R_16F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, 
+ d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE - && typeE == HIP_R_32F) + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIP_R_32F) { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F - && typeE == HIP_R_32F) + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIP_R_32F) { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE - && typeE == HIP_R_64F) + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE && typeE == HIP_R_64F + && computeType == HIP_R_32F) { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F && typeE == HIP_R_64F + && computeType == HIP_R_32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE && typeE == HIP_R_64F + && computeType == HIP_R_64F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F - && typeE == HIP_R_64F) + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F && typeE == HIP_R_64F + && computeType == HIP_R_64F) { return ActorCriticSelection::selectWinner(winner, - candidates, - typeA, - 
a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::BILINEAR, + double>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } return HIPTENSOR_STATUS_EXECUTION_FAILED; } diff --git a/library/src/contraction/contraction_selection.hpp b/library/src/contraction/contraction_selection.hpp index 9ceb6a14..deb980d9 100644 --- a/library/src/contraction/contraction_selection.hpp +++ b/library/src/contraction/contraction_selection.hpp @@ -49,9 +49,15 @@ namespace hiptensor hipDataType typeE, std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, const uint64_t workspaceSize); - template + template struct ActorCriticSelection { static hiptensorStatus_t @@ -87,6 +93,7 @@ namespace hiptensor hipDataType typeE, std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, const uint64_t workspaceSize); } // namespace hiptensor diff --git a/library/src/contraction/contraction_solution_impl.hpp b/library/src/contraction/contraction_solution_impl.hpp index 5e191441..3b672fbb 100644 --- a/library/src/contraction/contraction_solution_impl.hpp +++ b/library/src/contraction/contraction_solution_impl.hpp @@ -95,11 +95,13 @@ namespace hiptensor if(alpha != nullptr) { - alphaF = hiptensor::readVal(alpha, HipDataType_v); + alphaF = hiptensor::readVal( + alpha, convertToComputeType(HipDataType_v)); } if(beta != nullptr) { - betaF = hiptensor::readVal(beta, HipDataType_v); + betaF = hiptensor::readVal( + beta, convertToComputeType(HipDataType_v)); } // CK has its own format for indices... @@ -205,7 +207,8 @@ namespace hiptensor if(alpha != nullptr) { - alphaF = hiptensor::readVal(alpha, HipDataType_v); + alphaF = hiptensor::readVal( + alpha, convertToComputeType(HipDataType_v)); } // CK has its own format for indices... 
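The readVal change above is the core of the alpha/beta fix: the type-erased scalars are now decoded with convertToComputeType(HipDataType_v<ComputeDataT>), i.e. according to the contraction's compute type, instead of the output tensor's data type. A minimal sketch of the idea follows; readScalarAsFloat is a hypothetical stand-in for the library's actual readVal/convertToComputeType helpers (extended in library/src/data_types.cpp by this patch), which differ in detail:

    #include <hip/hip_bfloat16.h>  // hip_bfloat16
    #include <hip/library_types.h> // hipDataType, HIP_R_16F, ...

    // Hypothetical helper: decode a type-erased scalar according to the
    // compute type selected for the contraction, widening to float for the
    // CK element-wise operation arguments.
    inline float readScalarAsFloat(void const* scalar, hipDataType computeType)
    {
        switch(computeType)
        {
        case HIP_R_16F:
            return static_cast<float>(*static_cast<_Float16 const*>(scalar));
        case HIP_R_16BF:
            return static_cast<float>(*static_cast<hip_bfloat16 const*>(scalar));
        case HIP_R_32F:
            return *static_cast<float const*>(scalar);
        case HIP_R_64F:
            return static_cast<float>(*static_cast<double const*>(scalar));
        default:
            return 0.0f; // unsupported compute type
        }
    }

If the caller packs alpha for one type and the library reads it as another (the pre-fix behavior, which assumed the output data type), the bit pattern is reinterpreted rather than converted; that is exactly the unit-test bug called out in this patch's commit message.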
diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index 6d481577..aec12e32 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -72,7 +72,34 @@ namespace hiptensor float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::half_t>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::bhalf_t>()); // Bilinear f64 registerSolutions( @@ -85,7 +112,20 @@ namespace hiptensor double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + double, + double, + ck::Tuple, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + double>()); // Scale bf16 registerSolutions( @@ -126,8 +166,34 @@ namespace hiptensor float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::half_t>()); + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::bhalf_t>()); // Scale f64 registerSolutions( enumerateContractionSolutions<2, @@ -139,6 +205,20 @@ namespace hiptensor double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + double, + double, + ck::Tuple<>, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + double>()); } } // namespace hiptensor diff --git a/library/src/contraction/contraction_solution_params.hpp b/library/src/contraction/contraction_solution_params.hpp index ec9de45c..4c44de88 100644 --- a/library/src/contraction/contraction_solution_params.hpp +++ b/library/src/contraction/contraction_solution_params.hpp @@ -49,10 +49,11 @@ namespace hiptensor virtual int32_t dimsK() const = 0; // Map to hipDataType - virtual hipDataType typeA() const = 0; - 
virtual hipDataType typeB() const = 0; - virtual hipDataType typeC() const = 0; - virtual hipDataType typeD() const = 0; + virtual hipDataType typeA() const = 0; + virtual hipDataType typeB() const = 0; + virtual hipDataType typeC() const = 0; + virtual hipDataType typeD() const = 0; + virtual hiptensorComputeType_t typeCompute() const = 0; // Map to operators virtual hiptensorOperator_t opA() const = 0; diff --git a/library/src/contraction/contraction_solution_params_impl.hpp b/library/src/contraction/contraction_solution_params_impl.hpp index bff33960..b84f9c2b 100644 --- a/library/src/contraction/contraction_solution_params_impl.hpp +++ b/library/src/contraction/contraction_solution_params_impl.hpp @@ -42,6 +42,7 @@ namespace std return hiptensor::Hash{}(s.dimsM(), s.dimsN(), s.dimsK(), + s.typeCompute(), s.typeA(), s.typeB(), s.typeC(), @@ -102,6 +103,11 @@ namespace hiptensor return HipDataType_v; } + hiptensorComputeType_t typeCompute() const override + { + return convertToComputeType(HipDataType_v); + } + hiptensorOperator_t opA() const override { return ElementWiseOperatorType_v; diff --git a/library/src/contraction/contraction_solution_registry.cpp b/library/src/contraction/contraction_solution_registry.cpp index 83674c81..9e2da1f9 100644 --- a/library/src/contraction/contraction_solution_registry.cpp +++ b/library/src/contraction/contraction_solution_registry.cpp @@ -53,19 +53,20 @@ namespace hiptensor } ContractionSolutionRegistry::Query - ContractionSolutionRegistry::Query::query(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) const + ContractionSolutionRegistry::Query::query(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) const { - auto solutionHash - = hashSolution(dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE); + auto solutionHash = hashSolution( + dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE, typeCompute); if(auto solutions = mSolutionHash.find(solutionHash); solutions != mSolutionHash.end()) { @@ -81,10 +82,14 @@ namespace hiptensor return query(hashDimsMNK(dimsM, dimsN, dimsK)); } - ContractionSolutionRegistry::Query ContractionSolutionRegistry::Query::query( - hipDataType typeA, hipDataType typeB, hipDataType typeC, hipDataType typeD) const + ContractionSolutionRegistry::Query + ContractionSolutionRegistry::Query::query(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) const { - return query(hashTypesABCD(typeA, typeB, typeC, typeD)); + return query(hashTypesComputeABCD(typeA, typeB, typeC, typeD, typeCompute)); } ContractionSolutionRegistry::Query @@ -159,18 +164,20 @@ namespace hiptensor /* static */ ContractionSolutionRegistry::Query::HashId - ContractionSolutionRegistry::Query::hashSolution(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) + ContractionSolutionRegistry::Query::hashSolution(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + 
hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) { - return Hash{}(dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE); + return Hash{}( + dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE, typeCompute); } /* static */ @@ -181,10 +188,14 @@ namespace hiptensor } /* static */ - ContractionSolutionRegistry::Query::HashId ContractionSolutionRegistry::Query::hashTypesABCD( - hipDataType typeA, hipDataType typeB, hipDataType typeC, hipDataType typeD) + ContractionSolutionRegistry::Query::HashId + ContractionSolutionRegistry::Query::hashTypesComputeABCD(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) { - return Hash{}(typeA, typeB, typeC, typeD); + return Hash{}(typeA, typeB, typeC, typeD, typeCompute); } /* static */ @@ -220,12 +231,16 @@ namespace hiptensor params->typeD(), params->opA(), params->opB(), - params->opCDE()); + params->opCDE(), + params->typeCompute()); auto dimsMNKHash = hashDimsMNK(params->dimsM(), params->dimsN(), params->dimsK()); - auto typesABCDHash - = hashTypesABCD(params->typeA(), params->typeB(), params->typeC(), params->typeD()); + auto typesComputeABCDHash = hashTypesComputeABCD(params->typeA(), + params->typeB(), + params->typeC(), + params->typeD(), + params->typeCompute()); auto elementOpsHash = hashElementOps(params->opA(), params->opB()); @@ -236,7 +251,7 @@ namespace hiptensor mAllSolutions[solutionUid] = solution; mSolutionHash[solutionHash].push_back(solution); mSolutionHash[dimsMNKHash].push_back(solution); - mSolutionHash[typesABCDHash].push_back(solution); + mSolutionHash[typesComputeABCDHash].push_back(solution); mSolutionHash[elementOpsHash].push_back(solution); mSolutionHash[contactionOpsHash].push_back(solution); } diff --git a/library/src/contraction/contraction_solution_registry.hpp b/library/src/contraction/contraction_solution_registry.hpp index d1b80ec5..44aaa97d 100644 --- a/library/src/contraction/contraction_solution_registry.hpp +++ b/library/src/contraction/contraction_solution_registry.hpp @@ -59,25 +59,27 @@ namespace hiptensor /// E.g. in this context, query further parameters. 
// By full solution type - Query query(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) const; + Query query(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) const; // By dimensions Query query(int32_t dimsM, int32_t dimsN, int32_t dimsK) const; // By data types - Query query(hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD) const; + Query query(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) const; // By element-wise operations Query query(hiptensorOperator_t opA, hiptensorOperator_t opB) const; @@ -104,22 +106,24 @@ namespace hiptensor Query query(HashId queryHash) const; // Hashing helpers - static HashId hashSolution(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE); + static HashId hashSolution(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute); static HashId hashDimsMNK(int32_t dimsM, int32_t dimsN, int32_t dimsK); - static HashId hashTypesABCD(hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD); + static HashId hashTypesComputeABCD(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute); static HashId hashElementOps(hiptensorOperator_t opA, hiptensorOperator_t opB); static HashId hashContractionOps(ContractionOpId_t opCDE); diff --git a/library/src/contraction/device/CMakeLists.txt b/library/src/contraction/device/CMakeLists.txt index b9b382c0..eacac5b1 100644 --- a/library/src/contraction/device/CMakeLists.txt +++ b/library/src/contraction/device/CMakeLists.txt @@ -33,10 +33,22 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp @@ -49,15 +61,27 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp ) - add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) - target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) +add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) +target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp index 7d777a83..3b3f6d47 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp index a9a97148..fd43f0ad 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp index d83d8d16..21fb8127 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp index bc49c82b..cc975c03 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp index a9d963ab..ff670630 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp index c139942e..be8bfe84 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp index 3c6ced30..4be69898 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp index 33c66296..2f6d630b 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp new file mode 100644 index 00000000..cc21216c --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp new file mode 100644 index 00000000..57c47457 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp new file mode 100644 index 00000000..a121fbb3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp new file mode 100644 index 00000000..7962da9f --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp new file mode 100644 index 00000000..ea2be147 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp new file mode 100644 index 00000000..d82ea442 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp new file mode 100644 index 00000000..772df2e3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp new file mode 100644 index 00000000..8b1d0681 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..ad5ce461 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance + = device_contraction_f64_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..ae3ee856 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance + = device_contraction_f64_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..b72005ad --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance + = device_contraction_f64_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..b94030e5 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance + = device_contraction_f64_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp index 05400151..1da8301f 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp index bba95b14..82c17500 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
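The scale instances in this stretch of the patch mirror the bilinear ones but drop the D operand entirely, which is why their layout suffix has three letters (for A/B/E) rather than four. In terms of the sketch above, scale is simply the beta = 0 special case; a purely illustrative usage (all extents and values assumed):

    #include <cstddef>
    #include <vector>

    int main()
    {
        const std::size_t m0 = 4, m1 = 4, n0 = 4, n1 = 4, k0 = 4, k1 = 4;
        std::vector<float> A(m0 * m1 * k0 * k1, 1.0f);
        std::vector<float> B(n0 * n1 * k0 * k1, 1.0f);
        std::vector<float> D(m0 * m1 * n0 * n1, 0.0f); // unused when beta == 0
        std::vector<float> E(m0 * m1 * n0 * n1, 0.0f);

        // Scale contraction E = alpha * contract(A, B): the bilinear sketch
        // above with the D term switched off via beta = 0.
        bilinear_contraction_ref(A, B, D, E, m0, m1, n0, n1, k0, k1, 2.0f, 0.0f);

        // Every E element is now alpha * k0 * k1 = 2 * 16 = 32.
        return 0;
    }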
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp index fb5ecec0..1febb560 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp index 1dd6613c..02b9d719 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp index e98aee20..5917e466 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
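The kkn/knn/mkn/mnn variants (and kknn/knnn/mknn/mnnn for bilinear) differ only in which index each tensor holds contiguously: per the comments in these files, each suffix letter names the fastest-changing dimension of A, B, (D,) and E in turn. A short stride sketch — illustrative only; the helper name and the concrete index orderings are assumptions, with extents borrowed from the samples later in this patch:

#include <array>
#include <cstddef>

// Row-major strides for a rank-4 tensor: the last extent is unit-stride.
std::array<std::size_t, 4> rowMajorStrides(const std::array<std::size_t, 4>& e)
{
    return {e[1] * e[2] * e[3], e[2] * e[3], e[3], 1};
}

int main()
{
    // A "k"-layout A[m0][m1][k0][k1]: the contracted index k1 gets stride 1.
    auto aK = rowMajorStrides({4, 3, 6, 5});
    // An "m"-layout instead keeps the free index m1 innermost, e.g. A[k0][k1][m0][m1].
    auto aM = rowMajorStrides({6, 5, 4, 3});
    return (aK[3] == 1 && aM[3] == 1) ? 0 : 1;
}

Registering one instance per layout lets the library match a kernel to whichever dimension is actually contiguous in the user's strides.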
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp index db8de1c0..216f470e 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp index 397ef327..3401b605 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp index 1f9221dc..fe2fa97d 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
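The new files that follow pair f32 tensor data with a lower-precision compute type (compute_bf16, then compute_f16). Conceptually, the compute type caps the precision carried through the inner product even though the tensors stay f32. A host-side illustration of the effect — illustrative only: the function names are invented here, bf16 conversion is mimicked by bit truncation rather than hip_bfloat16's actual rounding behavior, and the GPU path in composable_kernel is not this code:

#include <cstdint>
#include <cstring>

// Mimics narrowing an f32 value to bf16 by zeroing the low 16 bits
// (keeps the sign bit, 8 exponent bits, and the top 7 mantissa bits).
float toBf16Truncated(float x)
{
    std::uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    bits &= 0xFFFF0000u;
    std::memcpy(&x, &bits, sizeof(x));
    return x;
}

// f32 data, bf16-limited compute: operands lose precision before the
// multiply, so results can differ in the low bits from an all-f32 run.
float dotF32DataBf16Compute(const float* a, const float* b, int n)
{
    float acc = 0.f;
    for(int i = 0; i < n; ++i)
    {
        acc += toBf16Truncated(a[i]) * toBf16Truncated(b[i]);
    }
    return acc;
}

On typical inputs, dotF32DataBf16Compute agrees with a plain f32 dot product only to roughly bf16-level relative error (about 2^-8), which is the trade these instances make for speed.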
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp new file mode 100644 index 00000000..9a104075 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp new file mode 100644 index 00000000..6a7f565f --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp new file mode 100644 index 00000000..094655bb --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp new file mode 100644 index 00000000..583b5b00 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp new file mode 100644 index 00000000..8eec79cf --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp new file mode 100644 index 00000000..a8999be8 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp new file mode 100644 index 00000000..e4e4b7de --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp new file mode 100644 index 00000000..a641f6e3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..04176d80 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance + = device_contraction_f64_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..06481fc7 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance + = device_contraction_f64_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..94922008 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance + = device_contraction_f64_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..e70b854b --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance + = device_contraction_f64_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index b96a204e..c7b7501b 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -242,17 +242,6 @@ hiptensorStatus_t hiptensorInitContractionFind(const hiptensorHandle_t* handl auto& instances = hiptensor::ContractionSolutionInstances::instance(); auto solnQ = instances->allSolutions(); - // Check if the current device supports F64 - if(!currentDevice.supportsF64()) - { - // Allow only supported f32 combos - solnQ = solnQ.query(HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F) || // Bilinear F32 - solnQ.query(HIP_R_32F, - HIP_R_32F, - hipDataType(hiptensor::NONE_TYPE), - HIP_R_32F); // Scale F32 (no C) - } - // Can do more checking for scale / bilinear, etc. if we need to. 
if(solnQ.solutionCount() == 0) @@ -461,15 +450,16 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* // Convert to concrete contraction solutions auto candidates = toContractionSolutionVec(find->mCandidates); - auto ADataType = desc->mTensorDesc[0].mType; - auto BDataType = desc->mTensorDesc[1].mType; - auto DDataType = desc->mTensorDesc[2].mType; - auto EDataType = desc->mTensorDesc[3].mType; + auto computeType = desc->mComputeType; + auto ADataType = desc->mTensorDesc[0].mType; + auto BDataType = desc->mTensorDesc[1].mType; + auto DDataType = desc->mTensorDesc[2].mType; + auto EDataType = desc->mTensorDesc[3].mType; // Query contraction solutions for the correct contraction operation and type auto solutionQ = hiptensor::ContractionSolutionRegistry::Query{candidates} .query((hiptensor::ContractionOpId_t)desc->mContractionOpId) - .query(ADataType, BDataType, DDataType, EDataType); + .query(ADataType, BDataType, DDataType, EDataType, computeType); candidates = toContractionSolutionVec(solutionQ.solutions()); @@ -500,6 +490,7 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* EDataType, desc->mTensorDesc[3].mLengths, desc->mTensorDesc[3].mStrides, + desc->mComputeType, workspaceSize); } else if(find->mSelectionAlgorithm == HIPTENSOR_ALGO_ACTOR_CRITIC) @@ -518,6 +509,7 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* EDataType, desc->mTensorDesc[3].mLengths, desc->mTensorDesc[3].mStrides, + desc->mComputeType, workspaceSize); } @@ -582,18 +574,9 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_32F) - { - snprintf( - alphaMsg, sizeof(alphaMsg), "alpha=%.6f", *(static_cast(alpha))); - } - else if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_64F) - { - snprintf(alphaMsg, - sizeof(alphaMsg), - "alpha=%.6lf", - *(static_cast(alpha))); - } + auto alphaValue + = hiptensor::readVal(alpha, plan->mContractionDesc.mComputeType); + snprintf(alphaMsg, sizeof(alphaMsg), "alpha=%.6lf", alphaValue); } if(beta == nullptr) @@ -602,15 +585,8 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_32F) - { - snprintf(betaMsg, sizeof(betaMsg), "beta=%.6f", *(static_cast(beta))); - } - else if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_64F) - { - snprintf( - betaMsg, sizeof(betaMsg), "beta=%.6lf", *(static_cast(beta))); - } + auto betaValue = hiptensor::readVal(beta, plan->mContractionDesc.mComputeType); + snprintf(betaMsg, sizeof(betaMsg), "beta=%.6lf", betaValue); } } else @@ -745,6 +721,10 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, if(logger->getLogMask() & HIPTENSOR_LOG_LEVEL_PERF_TRACE) { auto time = (*cSolution)(StreamConfig{stream, true}); + if(time < 0) + { + return HIPTENSOR_STATUS_CK_ERROR; + } int32_t m, n, k; std::tie(m, n, k) = cSolution->problemDims(); @@ -773,7 +753,10 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, // Perform contraction without timing else { - (*cSolution)(StreamConfig{stream, false}); + if((*cSolution)(StreamConfig{stream, false}) < 0) + { + return HIPTENSOR_STATUS_CK_ERROR; + } } return HIPTENSOR_STATUS_SUCCESS; diff --git a/library/src/data_types.cpp b/library/src/data_types.cpp index b270973d..38e9f186 100644 --- a/library/src/data_types.cpp +++ b/library/src/data_types.cpp @@ -132,6 +132,49 @@ namespace hiptensor } } + void 
writeVal(void const* addr, hiptensorComputeType_t id, double value) + { + if(id == HIPTENSOR_COMPUTE_16F) + { + *(_Float16*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_16BF) + { + *(hip_bfloat16*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_32F) + { + *(float*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_64F) + { + *(double*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_8U) + { + *(uint8_t*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_8I) + { + *(int8_t*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_32U) + { + *(uint32_t*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_32I) + { + *(int32_t*)addr = value; + } + else + { +#if !NDEBUG + std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; +#endif // !NDEBUG + return; + } + } + } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType) diff --git a/library/src/include/data_types.hpp b/library/src/include/data_types.hpp index 42197650..19ccca6c 100644 --- a/library/src/include/data_types.hpp +++ b/library/src/include/data_types.hpp @@ -65,6 +65,8 @@ namespace hiptensor template T readVal(void const* value, hiptensorComputeType_t id); + void writeVal(void const* addr, hiptensorComputeType_t id, double value); + } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType); diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index 15972d60..de834d72 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -29,9 +29,17 @@ if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) add_hiptensor_sample(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) add_hiptensor_sample(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) add_hiptensor_sample(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) + add_hiptensor_sample(simple_contraction_scale_f64 simple_scale_contraction_f64.cpp) + add_hiptensor_sample(simple_contraction_scale_f64_f32 simple_scale_contraction_f64_f32.cpp) + add_hiptensor_sample(simple_contraction_scale_f32_f16 simple_scale_contraction_f32_f16.cpp) + add_hiptensor_sample(simple_contraction_scale_f32_bf16 simple_scale_contraction_f32_bf16.cpp) add_hiptensor_sample(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) add_hiptensor_sample(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) add_hiptensor_sample(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f64 simple_bilinear_contraction_f64.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f64_f32 simple_bilinear_contraction_f64_f32.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f32_f16 simple_bilinear_contraction_f32_f16.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f32_bf16 simple_bilinear_contraction_f32_bf16.cpp) # If building hipTensor samples as a standalone Cmake project else() @@ -44,6 +52,18 @@ else() add_executable(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) target_link_libraries(simple_contraction_scale_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_scale_f32_bf16 simple_scale_contraction_f32_bf16.cpp) + target_link_libraries(simple_contraction_scale_f32_bf16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_scale_f32_f16 simple_scale_contraction_f32_f16.cpp) + target_link_libraries(simple_contraction_scale_f32_f16 PRIVATE hiptensor::hiptensor) + + 
add_executable(simple_contraction_scale_f64 simple_scale_contraction_f64.cpp) + target_link_libraries(simple_contraction_scale_f64 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_scale_f64_f32 simple_scale_contraction_f64_f32.cpp) + target_link_libraries(simple_contraction_scale_f64_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) target_link_libraries(simple_contraction_bilinear_f16 PRIVATE hiptensor::hiptensor) @@ -53,4 +73,15 @@ else() add_executable(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) target_link_libraries(simple_contraction_bilinear_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_bilinear_f32_bf16 simple_bilinear_contraction_f32_bf16.cpp) + target_link_libraries(simple_contraction_bilinear_f32_bf16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_bilinear_f32_f16 simple_bilinear_contraction_f32_f16.cpp) + target_link_libraries(simple_contraction_bilinear_f32_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_bilinear_f64 simple_bilinear_contraction_f64.cpp) + target_link_libraries(simple_contraction_bilinear_f64 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_bilinear_f64_f32 simple_bilinear_contraction_f64_f32.cpp) + target_link_libraries(simple_contraction_bilinear_f64_f32 PRIVATE hiptensor::hiptensor) endif() diff --git a/samples/01_contraction/simple_bilinear_contraction.hpp b/samples/01_contraction/simple_bilinear_contraction.hpp new file mode 100644 index 00000000..aaef4a1b --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction.hpp @@ -0,0 +1,351 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +template +int bilinearContractionSample() +{ + floatTypeCompute alpha = (floatTypeCompute)1.0f; + floatTypeCompute beta = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * + *C_{m,n,u,v} + **********************/ + + std::vector modeC{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeC = modeC.size(); + + std::unordered_map extent; + + extent['m'] = 4; + extent['n'] = 3; + extent['u'] = 4; + extent['v'] = 3; + extent['h'] = 6; + extent['k'] = 5; + + std::vector c_ms_ns_lengths; + for(auto mode : modeC) + { + c_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t c_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &c_ms_ns, + nmodeC, + c_ms_ns_lengths.data(), + NULL, /*stride*/ + typeC, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
+              << std::endl;
+
+    size_t elementsA = std::accumulate(
+        a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies<size_t>());
+    size_t elementsB = std::accumulate(
+        b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies<size_t>());
+    size_t elementsC = std::accumulate(
+        c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies<size_t>());
+
+    size_t sizeA = sizeof(ADataType) * elementsA;
+    size_t sizeB = sizeof(BDataType) * elementsB;
+    size_t sizeC = sizeof(CDataType) * elementsC;
+
+    ADataType* A = nullptr;
+    BDataType* B = nullptr;
+    CDataType* C = nullptr;
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA));
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB));
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC));
+
+    void *A_d, *B_d, *C_d;
+
+    CHECK_HIP_ERROR(hipMalloc(static_cast<void**>(&A_d), sizeA));
+    CHECK_HIP_ERROR(hipMalloc(static_cast<void**>(&B_d), sizeB));
+    CHECK_HIP_ERROR(hipMalloc(static_cast<void**>(&C_d), sizeC));
+
+    /*******************
+     * Initialize data
+     *******************/
+    int initMethod = 0; // TODO read value from command line
+    for(int64_t i = 0; i < elementsA; i++)
+    {
+        if(initMethod == 0)
+        {
+            A[i] = ADataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100;
+        }
+        else
+        {
+            A[i] = (ADataType)(float(i) / 100);
+        }
+    }
+
+    for(int64_t i = 0; i < elementsB; i++)
+    {
+        if(initMethod == 0)
+        {
+            B[i] = BDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100;
+        }
+        else
+        {
+            B[i] = (BDataType)(float(i) / 100);
+        }
+    }
+
+    for(int64_t i = 0; i < elementsC; i++)
+    {
+        if(initMethod == 0)
+        {
+            C[i] = CDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100;
+        }
+        else
+        {
+            C[i] = (CDataType)(float(i) / 100);
+        }
+    }
+
+    /********************************************
+     * Transfer the Host Tensor to Device Memory *
+     ********************************************/
+    std::cout << "Initializing device data..."
+              << std::endl;
+
+    CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast<const void*>(A), sizeA, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast<const void*>(B), sizeB, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast<const void*>(C), sizeC, hipMemcpyHostToDevice));
+
+    /************************************************
+     * Retrieve the memory alignment for each tensor
+     ************************************************/
+    uint32_t alignmentRequirementA;
+    CHECK_HIPTENSOR_ERROR(
+        hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA));
+
+    uint32_t alignmentRequirementB;
+    CHECK_HIPTENSOR_ERROR(
+        hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB));
+
+    uint32_t alignmentRequirementC;
+    CHECK_HIPTENSOR_ERROR(
+        hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC));
+
+    /*******************************
+     * Create Contraction Descriptor
+     *******************************/
+
+    std::cout << "a_ms_ks: " << a_ms_ks << std::endl;
+    std::cout << "b_ns_ks: " << b_ns_ks << std::endl;
+    std::cout << "c_ms_ns: " << c_ms_ns << std::endl;
+
+    hiptensorContractionDescriptor_t desc;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle,
+                                                             &desc,
+                                                             &a_ms_ks,
+                                                             modeA.data(),
+                                                             alignmentRequirementA,
+                                                             &b_ns_ks,
+                                                             modeB.data(),
+                                                             alignmentRequirementB,
+                                                             &c_ms_ns,
+                                                             modeC.data(),
+                                                             alignmentRequirementC,
+                                                             &c_ms_ns,
+                                                             modeC.data(),
+                                                             alignmentRequirementC,
+                                                             typeCompute));
+    /**************************
+     * Set the algorithm to use
+     ***************************/
+
+    hiptensorContractionFind_t find;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT));
+
+    /**********************
+     * Query workspace
+     **********************/
+
+    uint64_t worksize = 0;
+    CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize(
+        handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize));
+
+    void* workspace = nullptr;
+
+    if(worksize > 0)
+    {
+        CHECK_HIP_ERROR(hipMalloc(static_cast<void**>(&workspace), worksize));
+    }
+
+    /**************************
+     * Create Contraction Plan
+     **************************/
+    std::cout << "Initializing contraction plan..." << std::endl;
+
+    hiptensorContractionPlan_t plan;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize));
+
+    std::cout << "Launching contraction kernel..."
+              << std::endl;
+
+    CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle,
+                                               &plan,
+                                               (void*)&alpha,
+                                               A_d,
+                                               B_d,
+                                               (void*)&beta,
+                                               C_d,
+                                               C_d,
+                                               workspace,
+                                               worksize,
+                                               0 /* stream */));
+
+#ifndef NDEBUG
+    bool printElements = false;
+    bool storeElements = false;
+
+    if(printElements || storeElements)
+    {
+        CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost));
+    }
+
+    if(printElements)
+    {
+        if(elementsA < MAX_ELEMENTS_PRINT_COUNT)
+        {
+            std::cout << "Tensor A elements:\n";
+            hiptensorPrintArrayElements(std::cout, A, elementsA);
+            std::cout << std::endl;
+        }
+
+        if(elementsB < MAX_ELEMENTS_PRINT_COUNT)
+        {
+            std::cout << "Tensor B elements:\n";
+            hiptensorPrintArrayElements(std::cout, B, elementsB);
+            std::cout << std::endl;
+        }
+
+        if(elementsC < MAX_ELEMENTS_PRINT_COUNT)
+        {
+            std::cout << "Tensor C elements:\n";
+            hiptensorPrintArrayElements(std::cout, C, elementsC);
+            std::cout << std::endl;
+        }
+    }
+
+    if(storeElements)
+    {
+        std::ofstream tensorA, tensorB, tensorC;
+        tensorA.open("tensor_A.txt");
+        hiptensorPrintElementsToFile(tensorA, A, elementsA, ", ");
+        tensorA.close();
+
+        tensorB.open("tensor_B.txt");
+        hiptensorPrintElementsToFile(tensorB, B, elementsB, ", ");
+        tensorB.close();
+
+        tensorC.open("tensor_C_bilinear_contraction_results.txt");
+        hiptensorPrintElementsToFile(tensorC, C, elementsC, ", ");
+        tensorC.close();
+    }
+
+#endif
+
+    CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle));
+
+    HIPTENSOR_FREE_HOST(A);
+    HIPTENSOR_FREE_HOST(B);
+    HIPTENSOR_FREE_HOST(C);
+
+    HIPTENSOR_FREE_DEVICE(A_d);
+    HIPTENSOR_FREE_DEVICE(B_d);
+    HIPTENSOR_FREE_DEVICE(C_d);
+    HIPTENSOR_FREE_DEVICE(workspace);
+
+    std::cout << "Finished!" << std::endl;
+
+    return 0;
+}
diff --git a/samples/01_contraction/simple_bilinear_contraction_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_bf16.cpp
index 0a4a9314..f6714a2f 100644
--- a/samples/01_contraction/simple_bilinear_contraction_bf16.cpp
+++ b/samples/01_contraction/simple_bilinear_contraction_bf16.cpp
@@ -23,17 +23,7 @@
 * THE SOFTWARE.
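
For orientation, the shared header above computes C[m,n,u,v] = alpha * sum_{h,k} A[m,n,h,k] * B[u,v,h,k] + beta * C[m,n,u,v]. Below is a minimal host-side sketch of that contraction, assuming the dense row-major packing implied by the NULL strides in the sample; the helper name bilinearReferenceF32 is hypothetical, and this is an illustration only, not the patch's CPU reference implementation.

    // Naive host reference for the 4-mode bilinear contraction used in the
    // samples. Layout assumption: A packed as [M][N][H][K], B as [U][V][H][K],
    // C as [M][N][U][V], all row-major with unit strides.
    #include <cstddef>

    void bilinearReferenceF32(const float* A, const float* B, float* C,
                              float alpha, float beta,
                              size_t M, size_t N, size_t U, size_t V,
                              size_t H, size_t K)
    {
        for(size_t m = 0; m < M; m++)
            for(size_t n = 0; n < N; n++)
                for(size_t u = 0; u < U; u++)
                    for(size_t v = 0; v < V; v++)
                    {
                        float acc = 0.0f;
                        for(size_t h = 0; h < H; h++)
                            for(size_t k = 0; k < K; k++)
                            {
                                // Contract over the shared modes h and k.
                                acc += A[((m * N + n) * H + h) * K + k]
                                       * B[((u * V + v) * H + h) * K + k];
                            }
                        float& c = C[((m * N + n) * U + u) * V + v];
                        c        = alpha * acc + beta * c;
                    }
    }
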
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_bilinear_contraction.hpp" int main(int argc, char* argv[]) { @@ -51,292 +41,17 @@ int main(int argc, char* argv[]) typedef hip_bfloat16 CDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_16BF; - hipDataType typeB = HIP_R_16BF; - hipDataType typeC = HIP_R_16BF; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.1f; - floatTypeCompute beta = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * - *C_{m,n,u,v} - **********************/ - - std::vector modeC{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeC = modeC.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector c_ms_ns_lengths; - for(auto mode : modeC) - { - c_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t c_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &c_ms_ns, - nmodeC, - c_ms_ns_lengths.data(), - NULL, /*stride*/ - typeC, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsC = std::accumulate( - c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeC = sizeof(CDataType) * elementsC; - - ADataType* A = nullptr; - BDataType* B = nullptr; - CDataType* C = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); - - void *A_d, *B_d, *C_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsC; i++) - { - C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementC; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "c_ms_ns: " << c_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - 
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - (void*)&beta, - C_d, - C_d, - workspace, - worksize, - 0 /* stream */)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsC < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor C elements:\n"; - hiptensorPrintArrayElements(std::cout, C, elementsC); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorC; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorC.open("tensor_C_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); - tensorC.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(C); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(C_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_16BF; + constexpr hipDataType typeB = HIP_R_16BF; + constexpr hipDataType typeC = HIP_R_16BF; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f16.cpp index d9d044c9..40708c77 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f16.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f16.cpp @@ -23,17 +23,7 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_bilinear_contraction.hpp" int main(int argc, char* argv[]) { @@ -51,292 +41,17 @@ int main(int argc, char* argv[]) typedef _Float16 CDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_16F; - hipDataType typeB = HIP_R_16F; - hipDataType typeC = HIP_R_16F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.1f; - floatTypeCompute beta = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * - *C_{m,n,u,v} - **********************/ - - std::vector modeC{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeC = modeC.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector c_ms_ns_lengths; - for(auto mode : modeC) - { - c_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t c_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &c_ms_ns, - nmodeC, - c_ms_ns_lengths.data(), - NULL, /*stride*/ - typeC, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsC = std::accumulate( - c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeC = sizeof(CDataType) * elementsC; - - ADataType* A = nullptr; - BDataType* B = nullptr; - CDataType* C = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); - - void *A_d, *B_d, *C_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsC; i++) - { - C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementC; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "c_ms_ns: " << c_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - 
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - (void*)&beta, - C_d, - C_d, - workspace, - worksize, - 0 /* stream */)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsC < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor C elements:\n"; - hiptensorPrintArrayElements(std::cout, C, elementsC); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorC; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorC.open("tensor_C_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); - tensorC.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(C); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(C_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_16F; + constexpr hipDataType typeB = HIP_R_16F; + constexpr hipDataType typeC = HIP_R_16F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f32.cpp index 5704a59d..ee046145 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f32.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f32.cpp @@ -23,17 +23,7 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_bilinear_contraction.hpp" int main(int argc, char* argv[]) { @@ -51,292 +41,17 @@ int main(int argc, char* argv[]) typedef float CDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_32F; - hipDataType typeB = HIP_R_32F; - hipDataType typeC = HIP_R_32F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.1f; - floatTypeCompute beta = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * - *C_{m,n,u,v} - **********************/ - - std::vector modeC{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeC = modeC.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector c_ms_ns_lengths; - for(auto mode : modeC) - { - c_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t c_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &c_ms_ns, - nmodeC, - c_ms_ns_lengths.data(), - NULL, /*stride*/ - typeC, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsC = std::accumulate( - c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeC = sizeof(CDataType) * elementsC; - - ADataType* A = nullptr; - BDataType* B = nullptr; - CDataType* C = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); - - void *A_d, *B_d, *C_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsC; i++) - { - C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementC; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "c_ms_ns: " << c_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - 
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - (void*)&beta, - C_d, - C_d, - workspace, - worksize, - 0 /* stream */)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsC < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor C elements:\n"; - hiptensorPrintArrayElements(std::cout, C, elementsC); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorC; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorC.open("tensor_C_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); - tensorC.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(C); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(C_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeC = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp new file mode 100644 index 00000000..42f60ecb --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+#include "simple_bilinear_contraction.hpp"
+
+int main(int argc, char* argv[])
+{
+    /***************************************
+     * Check device support                *
+     **************************************/
+    if(!isF32Supported())
+    {
+        std::cout << "unsupported host device" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    typedef float        ADataType;
+    typedef float        BDataType;
+    typedef float        CDataType;
+    typedef hip_bfloat16 floatTypeCompute;
+
+    constexpr hipDataType            typeA       = HIP_R_32F;
+    constexpr hipDataType            typeB       = HIP_R_32F;
+    constexpr hipDataType            typeC       = HIP_R_32F;
+    constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF;
+
+    return bilinearContractionSample<ADataType,
+                                     BDataType,
+                                     CDataType,
+                                     floatTypeCompute,
+                                     typeA,
+                                     typeB,
+                                     typeC,
+                                     typeCompute>();
+}
diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp
new file mode 100644
index 00000000..d39a4fca
--- /dev/null
+++ b/samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+#include "simple_bilinear_contraction.hpp"
+
+int main(int argc, char* argv[])
+{
+    /***************************************
+     * Check device support                *
+     **************************************/
+    if(!isF32Supported())
+    {
+        std::cout << "unsupported host device" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    typedef float    ADataType;
+    typedef float    BDataType;
+    typedef float    CDataType;
+    typedef _Float16 floatTypeCompute;
+
+    constexpr hipDataType            typeA       = HIP_R_32F;
+    constexpr hipDataType            typeB       = HIP_R_32F;
+    constexpr hipDataType            typeC       = HIP_R_32F;
+    constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F;
+
+    return bilinearContractionSample<ADataType,
+                                     BDataType,
+                                     CDataType,
+                                     floatTypeCompute,
+                                     typeA,
+                                     typeB,
+                                     typeC,
+                                     typeCompute>();
+}
diff --git a/samples/01_contraction/simple_bilinear_contraction_f64.cpp b/samples/01_contraction/simple_bilinear_contraction_f64.cpp
new file mode 100644
index 00000000..412ebbc5
--- /dev/null
+++ b/samples/01_contraction/simple_bilinear_contraction_f64.cpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+#include "simple_bilinear_contraction.hpp"
+
+int main(int argc, char* argv[])
+{
+    /***************************************
+     * Check device support                *
+     **************************************/
+    if(!isF32Supported())
+    {
+        std::cout << "unsupported host device" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    typedef double ADataType;
+    typedef double BDataType;
+    typedef double CDataType;
+    typedef double floatTypeCompute;
+
+    constexpr hipDataType            typeA       = HIP_R_64F;
+    constexpr hipDataType            typeB       = HIP_R_64F;
+    constexpr hipDataType            typeC       = HIP_R_64F;
+    constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F;
+
+    return bilinearContractionSample<ADataType,
+                                     BDataType,
+                                     CDataType,
+                                     floatTypeCompute,
+                                     typeA,
+                                     typeB,
+                                     typeC,
+                                     typeCompute>();
+}
diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp
new file mode 100644
index 00000000..673c4768
--- /dev/null
+++ b/samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+#include "simple_bilinear_contraction.hpp"
+
+int main(int argc, char* argv[])
+{
+    /***************************************
+     * Check device support                *
+     **************************************/
+    if(!isF32Supported())
+    {
+        std::cout << "unsupported host device" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    typedef double ADataType;
+    typedef double BDataType;
+    typedef double CDataType;
+    typedef float  floatTypeCompute;
+
+    constexpr hipDataType            typeA       = HIP_R_64F;
+    constexpr hipDataType            typeB       = HIP_R_64F;
+    constexpr hipDataType            typeC       = HIP_R_64F;
+    constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F;
+
+    return bilinearContractionSample<ADataType,
+                                     BDataType,
+                                     CDataType,
+                                     floatTypeCompute,
+                                     typeA,
+                                     typeB,
+                                     typeC,
+                                     typeCompute>();
+}
diff --git a/samples/01_contraction/simple_scale_contraction.hpp b/samples/01_contraction/simple_scale_contraction.hpp
new file mode 100644
index 00000000..e9d482c3
--- /dev/null
+++ b/samples/01_contraction/simple_scale_contraction.hpp
@@ -0,0 +1,341 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <unordered_map>
+#include <hiptensor/hiptensor.hpp>
+#include <hiptensor/hiptensor_types.hpp>
+
+#include "common.hpp"
+
+template<typename ADataType,
+         typename BDataType,
+         typename DDataType,
+         typename floatTypeCompute,
+         hipDataType typeA,
+         hipDataType typeB,
+         hipDataType typeD,
+         hiptensorComputeType_t typeCompute>
+int scaleContractionSample()
+{
+    floatTypeCompute alpha = (floatTypeCompute)1.0f;
+    /**********************
+     * Computing: D_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k}
+     **********************/
+
+    std::vector<int32_t> modeD{'m', 'n', 'u', 'v'};
+    std::vector<int32_t> modeA{'m', 'n', 'h', 'k'};
+    std::vector<int32_t> modeB{'u', 'v', 'h', 'k'};
+
+    int nmodeA = modeA.size();
+    int nmodeB = modeB.size();
+    int nmodeD = modeD.size();
+
+    std::unordered_map<int32_t, int64_t> extent;
+
+    extent['m'] = 4;
+    extent['n'] = 3;
+    extent['u'] = 4;
+    extent['v'] = 3;
+    extent['h'] = 6;
+    extent['k'] = 5;
+
+    std::vector<int64_t> d_ms_ns_lengths;
+    for(auto mode : modeD)
+    {
+        d_ms_ns_lengths.push_back(extent[mode]);
+    }
+
+    std::vector<int64_t> a_ms_ks_lengths;
+    for(auto mode : modeA)
+    {
+        a_ms_ks_lengths.push_back(extent[mode]);
+    }
+
+    std::vector<int64_t> b_ns_ks_lengths;
+    for(auto mode : modeB)
+    {
+        b_ns_ks_lengths.push_back(extent[mode]);
+    }
+
+    hiptensorHandle_t* handle;
+    CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle));
+
+    CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE));
+
+    /********************************************
+     * Initialize tensors with the input lengths *
+     ********************************************/
+    hiptensorTensorDescriptor_t a_ms_ks;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle,
+                                                        &a_ms_ks,
+                                                        nmodeA,
+                                                        a_ms_ks_lengths.data(),
+                                                        NULL, /*stride*/
+                                                        typeA,
+                                                        HIPTENSOR_OP_IDENTITY));
+
+    hiptensorTensorDescriptor_t b_ns_ks;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle,
+                                                        &b_ns_ks,
+                                                        nmodeB,
+                                                        b_ns_ks_lengths.data(),
+                                                        NULL, /*stride*/
+                                                        typeB,
+                                                        HIPTENSOR_OP_IDENTITY));
+
+    hiptensorTensorDescriptor_t d_ms_ns;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle,
+                                                        &d_ms_ns,
+                                                        nmodeD,
+                                                        d_ms_ns_lengths.data(),
+                                                        NULL, /*stride*/
+                                                        typeD,
+                                                        HIPTENSOR_OP_IDENTITY));
+
+    /**********************
+     * Allocating data
+     **********************/
+    std::cout << "Initializing host data..."
+              << std::endl;
+
+    size_t elementsA = std::accumulate(
+        a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies<size_t>());
+    size_t elementsB = std::accumulate(
+        b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies<size_t>());
+    size_t elementsD = std::accumulate(
+        d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies<size_t>());
+
+    size_t sizeA = sizeof(ADataType) * elementsA;
+    size_t sizeB = sizeof(BDataType) * elementsB;
+    size_t sizeD = sizeof(DDataType) * elementsD;
+
+    ADataType* A = nullptr;
+    BDataType* B = nullptr;
+    DDataType* D = nullptr;
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA));
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB));
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD));
+
+    void *A_d, *B_d, *D_d;
+
+    CHECK_HIP_ERROR(hipMalloc(static_cast<void**>(&A_d), sizeA));
+    CHECK_HIP_ERROR(hipMalloc(static_cast<void**>(&B_d), sizeB));
+    CHECK_HIP_ERROR(hipMalloc(static_cast<void**>(&D_d), sizeD));
+
+    /*******************
+     * Initialize data
+     *******************/
+    int initMethod = 0; // TODO read the value from command line
+    for(int64_t i = 0; i < elementsA; i++)
+    {
+        if(initMethod == 0)
+        {
+            A[i] = ADataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100;
+        }
+        else
+        {
+            A[i] = (ADataType)(float(i) / 100);
+        }
+    }
+
+    for(int64_t i = 0; i < elementsB; i++)
+    {
+        if(initMethod == 0)
+        {
+            B[i] = BDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100;
+        }
+        else
+        {
+            B[i] = (BDataType)(float(i) / 100);
+        }
+    }
+
+    for(int64_t i = 0; i < elementsD; i++)
+    {
+        D[i] = std::numeric_limits<DDataType>::signaling_NaN();
+    }
+
+    /********************************************
+     * Transfer the Host Tensor to Device Memory *
+     ********************************************/
+    std::cout << "Initializing device data..."
+              << std::endl;
+
+    CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast<const void*>(A), sizeA, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast<const void*>(B), sizeB, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD));
+
+    /************************************************
+     * Retrieve the memory alignment for each tensor
+     ************************************************/
+    uint32_t alignmentRequirementA;
+    CHECK_HIPTENSOR_ERROR(
+        hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA));
+
+    uint32_t alignmentRequirementB;
+    CHECK_HIPTENSOR_ERROR(
+        hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB));
+
+    uint32_t alignmentRequirementD;
+    CHECK_HIPTENSOR_ERROR(
+        hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD));
+
+    /*******************************
+     * Create Contraction Descriptor
+     *******************************/
+
+    std::cout << "a_ms_ks: " << a_ms_ks << std::endl;
+    std::cout << "b_ns_ks: " << b_ns_ks << std::endl;
+    std::cout << "d_ms_ns: " << d_ms_ns << std::endl;
+
+    hiptensorContractionDescriptor_t desc;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle,
+                                                             &desc,
+                                                             &a_ms_ks,
+                                                             modeA.data(),
+                                                             alignmentRequirementA,
+                                                             &b_ns_ks,
+                                                             modeB.data(),
+                                                             alignmentRequirementB,
+                                                             nullptr,
+                                                             nullptr,
+                                                             0,
+                                                             &d_ms_ns,
+                                                             modeD.data(),
+                                                             alignmentRequirementD,
+                                                             typeCompute));
+    /**************************
+     * Set the algorithm to use
+     ***************************/
+
+    hiptensorContractionFind_t find;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT));
+
+    /**********************
+     * Query workspace
+     **********************/
+
+    uint64_t worksize = 0;
+    CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize(
+        handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize));
+
+    void* workspace = nullptr;
+
+    if(worksize > 0)
+    {
+        CHECK_HIP_ERROR(hipMalloc(static_cast<void**>(&workspace), worksize));
+    }
+
+    /**************************
+     * Create Contraction Plan
+     **************************/
+    std::cout << "Initializing contraction plan..." << std::endl;
+
+    hiptensorContractionPlan_t plan;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize));
+
+    std::cout << "Launching contraction kernel..."
+              << std::endl;
+
+    CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle,
+                                               &plan,
+                                               (void*)&alpha,
+                                               A_d,
+                                               B_d,
+                                               nullptr,
+                                               nullptr,
+                                               D_d,
+                                               workspace,
+                                               worksize,
+                                               0 /* stream */));
+
+#ifndef NDEBUG
+    bool printElements = false;
+    bool storeElements = false;
+
+    if(printElements || storeElements)
+    {
+        CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost));
+    }
+
+    if(printElements)
+    {
+        if(elementsA < MAX_ELEMENTS_PRINT_COUNT)
+        {
+            std::cout << "Tensor A elements:\n";
+            hiptensorPrintArrayElements(std::cout, A, elementsA);
+            std::cout << std::endl;
+        }
+
+        if(elementsB < MAX_ELEMENTS_PRINT_COUNT)
+        {
+            std::cout << "Tensor B elements:\n";
+            hiptensorPrintArrayElements(std::cout, B, elementsB);
+            std::cout << std::endl;
+        }
+
+        if(elementsD < MAX_ELEMENTS_PRINT_COUNT)
+        {
+            std::cout << "Tensor D elements:\n";
+            hiptensorPrintArrayElements(std::cout, D, elementsD);
+            std::cout << std::endl;
+        }
+    }
+
+    if(storeElements)
+    {
+        std::ofstream tensorA, tensorB, tensorD;
+        tensorA.open("tensor_A.txt");
+        hiptensorPrintElementsToFile(tensorA, A, elementsA, ", ");
+        tensorA.close();
+
+        tensorB.open("tensor_B.txt");
+        hiptensorPrintElementsToFile(tensorB, B, elementsB, ", ");
+        tensorB.close();
+
+        tensorD.open("tensor_D_scale_contraction_results.txt");
+        hiptensorPrintElementsToFile(tensorD, D, elementsD, ", ");
+        tensorD.close();
+    }
+
+#endif
+
+    CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle));
+
+    HIPTENSOR_FREE_HOST(A);
+    HIPTENSOR_FREE_HOST(B);
+    HIPTENSOR_FREE_HOST(D);
+
+    HIPTENSOR_FREE_DEVICE(A_d);
+    HIPTENSOR_FREE_DEVICE(B_d);
+    HIPTENSOR_FREE_DEVICE(D_d);
+    HIPTENSOR_FREE_DEVICE(workspace);
+
+    std::cout << "Finished!" << std::endl;
+
+    return 0;
+}
diff --git a/samples/01_contraction/simple_scale_contraction_bf16.cpp b/samples/01_contraction/simple_scale_contraction_bf16.cpp
index e05916bf..7b0f8b6c 100644
--- a/samples/01_contraction/simple_scale_contraction_bf16.cpp
+++ b/samples/01_contraction/simple_scale_contraction_bf16.cpp
@@ -23,16 +23,7 @@
 * THE SOFTWARE.
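
The scale path in the header above differs from the bilinear one only in how the C operand is dropped: the descriptor receives nullptr for the C descriptor and modes with alignment 0, and the launch passes nullptr for both beta and the C pointer, so the result is D = alpha * (A contracted with B). Condensed from the code above, with the surrounding declarations (handle, desc, plan, tensors, workspace) assumed:

    // Scale contraction descriptor: no C tensor participates.
    CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(
        handle, &desc,
        &a_ms_ks, modeA.data(), alignmentRequirementA,
        &b_ns_ks, modeB.data(), alignmentRequirementB,
        nullptr, nullptr, 0, // C descriptor, C modes, C alignment: omitted
        &d_ms_ns, modeD.data(), alignmentRequirementD,
        typeCompute));

    // At launch, beta and the C pointer are likewise nullptr.
    CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, &plan,
                                               (void*)&alpha, A_d, B_d,
                                               nullptr, nullptr, D_d,
                                               workspace, worksize,
                                               0 /* stream */));
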
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_scale_contraction.hpp" int main(int argc, char* argv[]) { @@ -44,291 +35,17 @@ int main(int argc, char* argv[]) typedef hip_bfloat16 DDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_16BF; - hipDataType typeB = HIP_R_16BF; - hipDataType typeD = HIP_R_16BF; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} - **********************/ - - std::vector modeD{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeD = modeD.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector d_ms_ns_lengths; - for(auto mode : modeD) - { - d_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t d_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &d_ms_ns, - nmodeD, - d_ms_ns_lengths.data(), - NULL, /*stride*/ - typeD, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsD = std::accumulate( - d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeD = sizeof(DDataType) * elementsD; - - ADataType* A = nullptr; - BDataType* B = nullptr; - DDataType* D = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); - - void *A_d, *B_d, *D_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsD; i++) - { - D[i] = std::numeric_limits::signaling_NaN(); - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementD; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "d_ms_ns: " << d_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - nullptr, - nullptr, - 0, - &d_ms_ns, - modeD.data(), - alignmentRequirementD, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * 
Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - nullptr, - nullptr, - D_d, - workspace, - worksize, - 0 /* stream */)); - - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsD < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor D elements:\n"; - hiptensorPrintArrayElements(std::cout, D, elementsD); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorD; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorD.open("tensor_D_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); - tensorD.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(D); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(D_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_16BF; + constexpr hipDataType typeB = HIP_R_16BF; + constexpr hipDataType typeD = HIP_R_16BF; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); } diff --git a/samples/01_contraction/simple_scale_contraction_f16.cpp b/samples/01_contraction/simple_scale_contraction_f16.cpp index 1e62be85..d69193f0 100644 --- a/samples/01_contraction/simple_scale_contraction_f16.cpp +++ b/samples/01_contraction/simple_scale_contraction_f16.cpp @@ -23,312 +23,35 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_scale_contraction.hpp" int main(int argc, char* argv[]) { /*************************************** * Check device support * **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + typedef _Float16 ADataType; typedef _Float16 BDataType; typedef _Float16 DDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_16F; - hipDataType typeB = HIP_R_16F; - hipDataType typeD = HIP_R_16F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} - **********************/ - - std::vector modeD{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeD = modeD.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector d_ms_ns_lengths; - for(auto mode : modeD) - { - d_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t d_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &d_ms_ns, - nmodeD, - d_ms_ns_lengths.data(), - NULL, /*stride*/ - typeD, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsD = std::accumulate( - d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeD = sizeof(DDataType) * elementsD; - - ADataType* A = nullptr; - BDataType* B = nullptr; - DDataType* D = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); - - void *A_d, *B_d, *D_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsD; i++) - { - D[i] = std::numeric_limits::signaling_NaN(); - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementD; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "d_ms_ns: " << d_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - nullptr, - nullptr, - 0, - &d_ms_ns, - modeD.data(), - alignmentRequirementD, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * 
Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - nullptr, - nullptr, - D_d, - workspace, - worksize, - 0 /* stream */)); - - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsD < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor D elements:\n"; - hiptensorPrintArrayElements(std::cout, D, elementsD); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorD; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorD.open("tensor_D_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); - tensorD.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(D); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(D_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_16F; + constexpr hipDataType typeB = HIP_R_16F; + constexpr hipDataType typeD = HIP_R_16F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); } diff --git a/samples/01_contraction/simple_scale_contraction_f32.cpp b/samples/01_contraction/simple_scale_contraction_f32.cpp index c76ec370..e53cc468 100644 --- a/samples/01_contraction/simple_scale_contraction_f32.cpp +++ b/samples/01_contraction/simple_scale_contraction_f32.cpp @@ -23,16 +23,8 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include "common.hpp" +#include "simple_scale_contraction.hpp" int main(int argc, char* argv[]) { @@ -50,291 +42,17 @@ int main(int argc, char* argv[]) typedef float DDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_32F; - hipDataType typeB = HIP_R_32F; - hipDataType typeD = HIP_R_32F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} - **********************/ - - std::vector modeD{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeD = modeD.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector d_ms_ns_lengths; - for(auto mode : modeD) - { - d_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t d_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &d_ms_ns, - nmodeD, - d_ms_ns_lengths.data(), - NULL, /*stride*/ - typeD, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsD = std::accumulate( - d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeD = sizeof(DDataType) * elementsD; - - ADataType* A = nullptr; - BDataType* B = nullptr; - DDataType* D = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); - - void *A_d, *B_d, *D_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsD; i++) - { - D[i] = std::numeric_limits::signaling_NaN(); - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementD; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "d_ms_ns: " << d_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - nullptr, - nullptr, - 0, - &d_ms_ns, - modeD.data(), - alignmentRequirementD, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * 
Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - nullptr, - nullptr, - D_d, - workspace, - worksize, - 0 /* stream */)); - - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsD < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor D elements:\n"; - hiptensorPrintArrayElements(std::cout, D, elementsD); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorD; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorD.open("tensor_D_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); - tensorD.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(D); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(D_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); } diff --git a/samples/01_contraction/simple_scale_contraction_f32_bf16.cpp b/samples/01_contraction/simple_scale_contraction_f32_bf16.cpp new file mode 100644 index 00000000..c11b8ded --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f32_bf16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float DDataType; + typedef hip_bfloat16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF; + + return scaleContractionSample(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f32_f16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f16.cpp new file mode 100644 index 00000000..377ee707 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f32_f16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ + +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float DDataType; + typedef _Float16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F; + + return scaleContractionSample(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f64.cpp b/samples/01_contraction/simple_scale_contraction_f64.cpp new file mode 100644 index 00000000..5eb94c15 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f64.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF64Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double DDataType; + typedef double floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeD = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F; + + return scaleContractionSample(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f64_f32.cpp b/samples/01_contraction/simple_scale_contraction_f64_f32.cpp new file mode 100644 index 00000000..fdec48ab --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f64_f32.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF64Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double DDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeD = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); +} diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index a08065a0..08ddf0b2 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -3,8 +3,11 @@ Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F] - - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] + - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_32F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index b28e9a88..08ddf0b2 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -4,7 +4,10 @@ Tensor Data Types: - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, 
NONE_TYPE, HIP_R_64F, HIP_R_32F ]
 Algorithm Types:
   - HIPTENSOR_ALGO_DEFAULT
   - HIPTENSOR_ALGO_DEFAULT_PATIENT
diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp
index 9446157f..ce67278f 100644
--- a/test/01_contraction/contraction_test.cpp
+++ b/test/01_contraction/contraction_test.cpp
@@ -125,6 +125,9 @@ namespace hiptensor
                     || (CDataType == NONE_TYPE));
         EXPECT_TRUE((DDataType == HIP_R_16F) || (DDataType == HIP_R_16BF)
                     || (DDataType == HIP_R_32F) || (DDataType == HIP_R_64F));
+        EXPECT_TRUE(
+            (computeType == HIPTENSOR_COMPUTE_16F) || (computeType == HIPTENSOR_COMPUTE_16BF)
+            || (computeType == HIPTENSOR_COMPUTE_32F) || (computeType == HIPTENSOR_COMPUTE_64F));
 
         mRunFlag &= checkDevice(DDataType);
 
@@ -488,7 +491,11 @@ namespace hiptensor
             auto CDataType = testType[2];
             auto DDataType = testType[3];
 
-            auto computeType = convertToComputeType(testType[4]);
+            auto computeType = convertToComputeType(testType[4]);
+            double alphaBuf = 0.;
+            double betaBuf = 0.;
+            writeVal(&alphaBuf, computeType, alpha);
+            writeVal(&betaBuf, computeType, beta);
 
             CHECK_HIPTENSOR_ERROR(
                 hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize));
@@ -497,20 +504,21 @@ namespace hiptensor
 
             CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle,
                                                        &plan,
-                                                       (void*)&alpha,
+                                                       (void*)&alphaBuf,
                                                        resource->deviceA().get(),
                                                        resource->deviceB().get(),
-                                                       (void*)&beta,
+                                                       (void*)&betaBuf,
                                                        resource->deviceC().get(),
                                                        resource->deviceD().get(),
                                                        workspace,
                                                        worksize,
                                                        0 /* stream */));
 
-            CHECK_HIPTENSOR_ERROR(hiptensorContractionReference((void*)&alpha,
+            CHECK_HIPTENSOR_ERROR(hiptensorContractionReference(&plan,
+                                                                (void*)&alphaBuf,
                                                                 resource->hostA().get(),
                                                                 resource->hostB().get(),
-                                                                (void*)&beta,
+                                                                (void*)&betaBuf,
                                                                 resource->hostC().get(),
                                                                 resource->hostD().get(),
                                                                 a_ms_ks.mLengths,

From ab8d557e0e68d29c5d3b17020c5c43ef898ede8f Mon Sep 17 00:00:00 2001
From: Cong Ma
Date: Thu, 30 Nov 2023 18:46:46 +0000
Subject: [PATCH 03/12] Add placeholder for solution unique_id

The Actor Critic solution unique_ids are not ready yet, so we add
placeholders in the new Actor Critic so that the unit tests can pass.
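
For reference, the selection shape being stubbed is just a map lookup on a
hard-coded id. Below is a minimal compilable sketch, assuming a candidates
map keyed by unique_id as in contraction_selection.cpp; ContractionSolution
and findPlaceholderSolution are illustrative stand-ins, not the hipTensor
types or API:

#include <cstddef>
#include <iostream>
#include <memory>
#include <unordered_map>

// Stand-in for the real solution type in contraction_solution.hpp.
struct ContractionSolution
{
    std::size_t id;
};

// Placeholder selection: look up a hard-coded unique_id in the candidate
// map; selection fails cleanly if no instance is registered under that id.
ContractionSolution* findPlaceholderSolution(
    std::unordered_map<std::size_t, std::unique_ptr<ContractionSolution>> const& candidates,
    std::size_t placeholderId)
{
    if(auto candidate = candidates.find(placeholderId); candidate != candidates.end())
    {
        return candidate->second.get();
    }
    return nullptr; // callers translate this into a hiptensor error status
}

int main()
{
    std::unordered_map<std::size_t, std::unique_ptr<ContractionSolution>> candidates;
    candidates[7255639152084218514ull]
        = std::make_unique<ContractionSolution>(ContractionSolution{7255639152084218514ull});

    // Same lookup shape as the TODO blocks in contraction_selection.cpp.
    if(auto* solution = findPlaceholderSolution(candidates, 7255639152084218514ull))
    {
        std::cout << "selected placeholder solution " << solution->id << "\n";
    }
    return 0;
}

Once the Actor Critic model is trained, the hard-coded ids above are the
only lines that need to change.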
--- .../src/contraction/contraction_selection.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index 888ef4c1..68c748b0 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -72,8 +72,8 @@ namespace hiptensor * hipDataTypeSize(typeE); void * A_d, *B_d, *D_d, *E_d, *wspace; - double alpha = 0.0d; - double beta = 0.0d; + double alpha = 0.0; + double beta = 0.0; writeVal(&alpha, computeType, 1.02); writeVal(&beta, computeType, 1.03); @@ -188,7 +188,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id - unique_id = 7255639152084218514; + unique_id = 7255639152084218514ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -237,7 +237,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id - unique_id = 7255639152084218514; + unique_id = 7255639152084218514ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -286,7 +286,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id - unique_id = 8689089455041651212; + unique_id = 8689089455041651212ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -335,7 +335,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id - unique_id = 8689089455041651212; + unique_id = 8689089455041651212ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -379,6 +379,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 1078559130597702989ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -421,6 +422,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 6506383527825239632ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -463,6 +465,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 14486135440731032454ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -510,6 +513,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 11931735240548010466ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1276,6 +1280,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 11912251726020349830ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1317,6 +1322,7 @@ namespace hiptensor int d6 = a_ms_ks_lengths[3]; size_t unique_id = 0; + unique_id = 15375432626310194825ull; // TODO select unique_id if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) From df27e326d15a65118a657b04c63eef37ecde946e Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Mon, 4 Dec 2023 16:12:44 +0000 Subject: [PATCH 04/12] Update contraction device instances Update contraction device instances since CK has updated them. 
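
The mechanical change in each *_instance.cpp below is the same: the
hand-written std::tuple of DeviceContractionMultipleD_Xdl_CShuffle
configurations is replaced by a shared alias pulled in from CK's
device_contraction_instance.hpp. A minimal sketch of that refactor pattern
follows; DeviceOp, its parameter list, and the three sample configurations
are all invented for illustration, and CK's real
device_contraction_kk_instance has a different, much larger signature:

#include <tuple>
#include <type_traits>

// Invented stand-in for a heavily parameterized device op; the real
// DeviceContractionMultipleD_Xdl_CShuffle takes dozens of template arguments.
template <typename AData, typename BData, typename EData, int BlockSize, int MPerBlock>
struct DeviceOp
{
};

// Before: every instance file spelled out the full tuning table.
using f32_instances_verbose = std::tuple<DeviceOp<float, float, float, 256, 256>,
                                         DeviceOp<float, float, float, 256, 128>,
                                         DeviceOp<float, float, float, 128, 128>>;

// After: one shared alias (in the spirit of device_contraction_kk_instance)
// owns the tuning table, and each instance file supplies only the data types.
template <typename AData, typename BData, typename EData>
using device_contraction_kk_instance = std::tuple<DeviceOp<AData, BData, EData, 256, 256>,
                                                  DeviceOp<AData, BData, EData, 256, 128>,
                                                  DeviceOp<AData, BData, EData, 128, 128>>;

using f32_instances_shared = device_contraction_kk_instance<float, float, float>;

// The refactor is only valid if the generated instance set is unchanged.
static_assert(std::is_same_v<f32_instances_verbose, f32_instances_shared>,
              "shared alias must reproduce the hand-written tuple");

int main()
{
    return 0;
}

This is why the diffs below mostly delete configuration rows: the tuning
tables now live in one place upstream instead of being copied per type.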
--- ..._shuffle_f32_f32_f32_f32_kknn_instance.cpp | 62 ++++++----------- ..._shuffle_f32_f32_f32_f32_knnn_instance.cpp | 65 ++++++------------ ..._shuffle_f32_f32_f32_f32_mknn_instance.cpp | 65 ++++++------------ ..._shuffle_f32_f32_f32_f32_mnnn_instance.cpp | 65 ++++++------------ ..._shuffle_f64_f64_f64_f64_kknn_instance.cpp | 59 ++++++---------- ..._shuffle_f64_f64_f64_f64_knnn_instance.cpp | 59 ++++++---------- ..._shuffle_f64_f64_f64_f64_mknn_instance.cpp | 59 ++++++---------- ..._shuffle_f64_f64_f64_f64_mnnn_instance.cpp | 59 ++++++---------- ...xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp | 65 +++++++----------- ...xdl_c_shuffle_f32_f32_f32_knn_instance.cpp | 68 +++++++------------ ...xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp | 68 +++++++------------ ...xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp | 68 +++++++------------ ...e_f64_f64_f64_compute_f32_kkn_instance.cpp | 27 +++++++- ...e_f64_f64_f64_compute_f32_knn_instance.cpp | 27 +++++++- ...e_f64_f64_f64_compute_f32_mkn_instance.cpp | 27 +++++++- ...e_f64_f64_f64_compute_f32_mnn_instance.cpp | 27 +++++++- ...xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp | 58 ++++++---------- ...xdl_c_shuffle_f64_f64_f64_knn_instance.cpp | 58 ++++++---------- ...xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp | 58 ++++++---------- ...xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp | 58 ++++++---------- 20 files changed, 460 insertions(+), 642 deletions(-) diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp index d8b80eb9..f924889f 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,42 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> - // clang-format on - >; + = device_contraction_kk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( @@ -89,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp 
b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp index 5444adc3..ad94eb1f 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 1, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 
32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_kn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp index b20c1204..8fb870a0 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp index 2bc3d1f2..aa3e9d32 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
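Editor's sketch (not part of the patch): the `A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]` comment each hunk keeps is the whole contract of these kernels. Written out as naive host-side reference loops (all extents, layouts, and names below are hypothetical, with row-major packed storage assumed), the bilinear m2_n2_k2 contraction is:

#include <vector>

void reference_bilinear_contraction(const std::vector<float>& A, // [M0][M1][K0][K1]
                                    const std::vector<float>& B, // [N0][N1][K0][K1]
                                    const std::vector<float>& D, // [M0][M1][N0][N1]
                                    std::vector<float>&       E, // [M0][M1][N0][N1]
                                    int M0, int M1, int N0, int N1, int K0, int K1,
                                    float alpha, float beta)
{
    // Packed row-major accessors for the two inputs.
    auto a = [&](int m0, int m1, int k0, int k1) {
        return A[((m0 * M1 + m1) * K0 + k0) * K1 + k1];
    };
    auto b = [&](int n0, int n1, int k0, int k1) {
        return B[((n0 * N1 + n1) * K0 + k0) * K1 + k1];
    };
    for(int m0 = 0; m0 < M0; ++m0)
        for(int m1 = 0; m1 < M1; ++m1)
            for(int n0 = 0; n0 < N0; ++n0)
                for(int n1 = 0; n1 < N1; ++n1)
                {
                    float acc = 0.f;
                    for(int k0 = 0; k0 < K0; ++k0)
                        for(int k1 = 0; k1 < K1; ++k1)
                            acc += a(m0, m1, k0, k1) * b(n0, n1, k0, k1);
                    const int e = ((m0 * M1 + m1) * N0 + n0) * N1 + n1;
                    E[e] = alpha * acc + beta * D[e]; // scale kernels: no D, E = alpha * acc
                }
}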
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
 #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 
-#include "common.hpp"
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 namespace ck
 {
@@ -41,45 +46,19 @@ namespace ck
         namespace instance
        {
-            using F32       = float;
-            using F32_Tuple = ck::Tuple<F32>;
-
-            template <index_t... Is>
-            using S = ck::Sequence<Is...>;
-
-            using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-            using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
-
-            static constexpr auto GemmMNKPadding
-                = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
             // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
             // m/n/n/n are the fast changing dimension for A/B/D/E
             using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance
-                = std::tuple<
-                // clang-format off
-            //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-            //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-            //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-            //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 1, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>,
-
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp index a1fe1ddf..a65ae1eb 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
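Editor's sketch (not part of the patch): the two CDE elementwise operations named throughout these hunks behave as epilogues on the GEMM accumulator. Assuming the usual CK semantics for ck::tensor_operation::element_wise::{Bilinear, Scale} (the structs below are illustrative stand-ins, not CK's declarations), Bilinear fuses the D tensor into the epilogue while Scale has no D operand:

struct BilinearSketch
{
    float alpha, beta;
    // e = alpha * accumulator + beta * d
    void operator()(float& e, const float& acc, const float& d) const
    {
        e = alpha * acc + beta * d;
    }
};

struct ScaleSketch
{
    float alpha;
    // e = alpha * accumulator
    void operator()(float& e, const float& acc) const { e = alpha * acc; }
};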
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
 #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 
-#include "common.hpp"
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 namespace ck
 {
@@ -41,39 +46,19 @@ namespace ck
         namespace instance
         {
-            using F64       = double;
-            using F64_Tuple = ck::Tuple<F64>;
-
-            template <index_t... Is>
-            using S = ck::Sequence<Is...>;
-
-            using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-            using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
-
-            static constexpr auto GemmMNKPadding
-                = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
             // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
             // k/k/n/n are the fast changing dimension for A/B/D/E
             using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance
-                = std::tuple<
-                // clang-format off
-            //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-            //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-            //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-            //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>,
-
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 16, 2, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1> - // clang-format on - >; + = device_contraction_f64_kk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp index a635bce8..4d6ccaa8 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
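Editor's sketch (not part of the patch): each of these files ends with the same registration idiom, `add_device_operation_instances(instances, ...)`, which appends one default-constructed device op per tuple element to the caller's vector. A simplified stand-in for CK's helper from add_device_operation_instance.hpp is shown below; the real helper's exact signature is not visible in this patch.

#include <memory>
#include <tuple>
#include <vector>

// Default-construct each op type in the tuple and hand ownership to the vector.
template <typename BaseOp, typename... Ops>
void add_device_operation_instances_sketch(std::vector<std::unique_ptr<BaseOp>>& op_ptrs,
                                           const std::tuple<Ops...>&)
{
    (op_ptrs.push_back(std::make_unique<Ops>()), ...);
}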
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
 #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 
-#include "common.hpp"
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 namespace ck
 {
@@ -41,39 +46,19 @@ namespace ck
         namespace instance
         {
-            using F64       = double;
-            using F64_Tuple = ck::Tuple<F64>;
-
-            template <index_t... Is>
-            using S = ck::Sequence<Is...>;
-
-            using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-            using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
-
-            static constexpr auto GemmMNKPadding
-                = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
             // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
             // k/n/n/n are the fast changing dimension for A/B/D/E
             using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance
-                = std::tuple<
-                // clang-format off
-            //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-            //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-            //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-            //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>,
-
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 1, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 1, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_kn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp index c77ffea4..071ccf62 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
 #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 
-#include "common.hpp"
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 namespace ck
 {
@@ -41,39 +46,19 @@ namespace ck
         namespace instance
         {
-            using F64       = double;
-            using F64_Tuple = ck::Tuple<F64>;
-
-            template <index_t... Is>
-            using S = ck::Sequence<Is...>;
-
-            using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-            using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
-
-            static constexpr auto GemmMNKPadding
-                = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
             // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
             // m/k/n/n are the fast changing dimension for A/B/D/E
             using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance
-                = std::tuple<
-                // clang-format off
-            //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-            //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-            //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-            //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>,
-
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 2, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 2, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp index c8a96a70..d8223df7 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
 #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 
-#include "common.hpp"
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 namespace ck
 {
@@ -41,39 +46,19 @@ namespace ck
         namespace instance
         {
-            using F64       = double;
-            using F64_Tuple = ck::Tuple<F64>;
-
-            template <index_t... Is>
-            using S = ck::Sequence<Is...>;
-
-            using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-            using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
-
-            static constexpr auto GemmMNKPadding
-                = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
             // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
             // m/n/n/n are the fast changing dimension for A/B/D/E
             using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance
-                = std::tuple<
-                // clang-format off
-            //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-            //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-            //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-            //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>,
-
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp index 88345e74..24d2d570 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
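Editor's sketch (not part of the patch): the kkn/knn/mkn (and four-letter kknn/knnn/...) file suffixes encode the fastest-changing index of A/B/(D/)E, as the in-file comments state. For the m2_n2_k2 case, with hypothetical extents, the two A layouts differ only in which index has unit stride:

// Hypothetical extents for illustration only.
constexpr int M1 = 4, K0 = 8, K1 = 2;
// "k"-suffix A: A[m0][m1][k0][k1] stored with k1 fastest (as in kknn files).
constexpr int strideA_kmajor[4] = {M1 * K0 * K1, K0 * K1, K1, 1};
// "m"-suffix A: the same logical tensor stored with m1 fastest (as in mknn files).
constexpr int strideA_mmajor[4] = {K0 * K1 * M1, 1, K1 * M1, M1};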
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
 #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 
-#include "common.hpp"
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 namespace ck
 {
@@ -41,42 +46,19 @@ namespace ck
         namespace instance
         {
-            using F32         = float;
-            using Empty_Tuple = ck::Tuple<>;
-
-            template <index_t... Is>
-            using S = ck::Sequence<Is...>;
-
-            using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-            using Scale       = ck::tensor_operation::element_wise::Scale;
-
-            static constexpr auto GemmMNKPadding
-                = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-            // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1]
-            // k/k/n are the fast changing dimension for A/B/E
+            // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+            // k/k/n/n are the fast changing dimension for A/B/D/E
             using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance
-                = std::tuple<
-                // clang-format off
-            //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-            //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-            //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-            //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>
-                // clang-format on
-                >;
+                = device_contraction_kk_instance;
 
             void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance(
                 std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                                        2,
                                                                        2,
                                                                        F32,
                                                                        F32,
                                                                        Empty_Tuple,
                                                                        F32,
                                                                        PassThrough,
                                                                        PassThrough,
-                                                                       Scale>>>&
-                    instances)
+                                                                       Scale,
+                                                                       F32>>>& instances)
            {
                 add_device_operation_instances(
                     instances,
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
index 38702afd..f559dc06 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
@@ -24,13 +24,18 @@
  *
  *******************************************************************************/
 
-// This (ifndef) is a hack to use customized behavior for buffer load rather
-// than using default setting Don't use this hack unless absolutely necessary!
-// FIXME: make the behavior of buffer load a configurable (template) parameter
-// of each device op
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
 #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 
-#include "common.hpp"
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 namespace ck
 {
@@ -41,45 +46,19 @@ namespace ck
         namespace instance
         {
-            using F32         = float;
-            using Empty_Tuple = ck::Tuple<>;
-
-            template <index_t... Is>
-            using S = ck::Sequence<Is...>;
-
-            using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-            using Scale       = ck::tensor_operation::element_wise::Scale;
-
-            static constexpr auto GemmMNKPadding
-                = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-            // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1]
-            // k/n/n are the fast changing dimension for A/B/E
+            // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+            // k/n/n/n are the fast changing dimension for A/B/D/E
             using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance
-                = std::tuple<
-                // clang-format off
-            //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-            //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-            //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-            //#####################################| | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 1, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 
128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>
-                // clang-format on
-                >;
+                = device_contraction_kn_instance;
 
             void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance(
                 std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                                        2,
                                                                        2,
                                                                        F32,
                                                                        F32,
                                                                        Empty_Tuple,
                                                                        F32,
                                                                        PassThrough,
                                                                        PassThrough,
-                                                                       Scale>>>&
-                    instances)
+                                                                       Scale,
+                                                                       F32>>>& instances)
             {
                 add_device_operation_instances(
                     instances,
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
index 735a5e34..a522052d 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
@@ -24,13 +24,18 @@
  *
  *******************************************************************************/
 
-// This (ifndef) is a hack to use customized behavior for buffer load rather
-// than using default setting Don't use this hack unless absolutely necessary!
-// FIXME: make the behavior of buffer load a configurable (template) parameter
-// of each device op
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
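Editor's sketch (not part of the patch): the "+ Scale," / "+ F32>>>&" and "+ Bilinear," / "+ F32>>>&" (or F64) additions in every registration signature point at one interface change: DeviceContractionMultipleD grows a trailing compute-type parameter. The declaration below is illustrative only; parameter names and ordering are assumptions, not CK's actual header.

namespace sketch
{
    template <int NumDimM, int NumDimN, int NumDimK,
              typename ADataType, typename BDataType,
              typename DsDataType, typename EDataType,
              typename AElementwiseOp, typename BElementwiseOp,
              typename CDEElementwiseOp,
              typename ComputeDataType = EDataType> // the new trailing argument
    struct DeviceContractionMultipleD; // declaration sketch only
} // namespace sketch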
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // m/k/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 
64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, 
GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp index d286e2d8..be35683b 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // m/n/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - 
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 1, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 
2, 1>, 1, 4, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp index 04176d80..dac46620 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp index 06481fc7..0830b49f 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp index 94922008..9a716ba3 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp index e70b854b..e02ac144 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp index f8904a8f..6f168ee2 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, 
F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 32, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 32, 64, 16, 2, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1> - // clang-format on - >; + = device_contraction_f64_kk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp index 56fc8b91..347a810c 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 
2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 1, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 1, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_kn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp index 231a0256..229d18c7 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 
2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 2, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 2, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp index 4fc648d4..bf1efa14 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 
2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, From f85df837f3ae885178b197f8c2435c14e9847a2c Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Mon, 4 Dec 2023 16:29:00 +0000 Subject: [PATCH 05/12] Print C in sample output 1. Initialize the data with 0.01, 0.02, ... by default 2. Print C
--- .../simple_bilinear_contraction.hpp | 22 ++++++++++++++----- .../simple_scale_contraction.hpp | 2 +- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/samples/01_contraction/simple_bilinear_contraction.hpp b/samples/01_contraction/simple_bilinear_contraction.hpp index aaef4a1b..27001232 100644 --- a/samples/01_contraction/simple_bilinear_contraction.hpp +++ b/samples/01_contraction/simple_bilinear_contraction.hpp @@ -154,7 +154,7 @@ int bilinearContractionSample() /******************* * Initialize data *******************/ - int initMethod = 0; // TODO read value from commandline + int initMethod = 1; // TODO read value from commandline for(int64_t i = 0; i < elementsA; i++) { if(initMethod == 0) @@ -287,11 +287,6 @@ int bilinearContractionSample() bool printElements = false; bool storeElements = false; - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - if(printElements) { if(elementsA < MAX_ELEMENTS_PRINT_COUNT) @@ -314,6 +309,15 @@ int bilinearContractionSample() hiptensorPrintArrayElements(std::cout, C, elementsC); std::cout << std::endl; } + + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + + if(elementsC < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor D elements:\n"; + hiptensorPrintArrayElements(std::cout, C, elementsC); + std::cout << std::endl; + } } if(storeElements) @@ -327,6 +331,12 @@ hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); tensorB.close(); + tensorC.open("tensor_C.txt"); + hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); + tensorC.close(); + + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + tensorC.open("tensor_C_scale_contraction_results.txt"); hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); tensorC.close(); diff --git a/samples/01_contraction/simple_scale_contraction.hpp b/samples/01_contraction/simple_scale_contraction.hpp index e9d482c3..78b026b6 100644 --- a/samples/01_contraction/simple_scale_contraction.hpp +++ b/samples/01_contraction/simple_scale_contraction.hpp @@ -151,7 +151,7 @@ int scaleContractionSample() /******************* * Initialize data *******************/ - int initMethod = 0; // TODO read the value from command line + int initMethod = 1; // TODO read the value from command line for(int64_t i = 0; i < elementsA; i++) { if(initMethod == 0) From 5c45a8c80dd0e90171a791bd945c0e41b84ef22d Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Tue, 5 Dec 2023 18:15:54 +0000 Subject: [PATCH 06/12] Set CK contraction instance to run only once When the logger level is set to HIPTENSOR_LOG_LEVEL_PERF_TRACE, we make CK instances measure the running time. The problem is that CK internally will run the contraction 10 times by default. This leads to an issue: it returns a wrong result for C = alpha A x B + beta C, because every extra run accumulates into C again. Setting StreamConfig.nrepeat_ = 1 makes the contraction run only once. --- library/src/contraction/hiptensor_contraction.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index c7b7501b..8148eeaa 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -720,7 +720,13 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, // Perform contraction with timing if LOG_LEVEL_PERF_TRACE if(logger->getLogMask() & HIPTENSOR_LOG_LEVEL_PERF_TRACE) { - auto time = (*cSolution)(StreamConfig{stream, true}); + auto time = (*cSolution)(StreamConfig{ + stream, // stream id + true, // time_kernel + 0, // log_level + 0, // cold_niters + 1, // nrepeat + }); if(time < 0) { return HIPTENSOR_STATUS_CK_ERROR;
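To illustrate why nrepeat_ must be 1 here: an in-place bilinear update is not idempotent, so a timing loop that re-launches the kernel over the same buffers changes the answer. A minimal standalone sketch of the effect, with made-up scalar values standing in for tensor elements (not hiptensor code):

    // Minimal sketch (hypothetical values): repeating the in-place update
    // C = alpha * (A x B) + beta * C accumulates into C again on every pass,
    // which is why the timing loop above needs nrepeat_ = 1.
    #include <iostream>

    int main()
    {
        float alpha = 1.0f, beta = 1.0f;
        float ab = 2.0f; // stands in for one element of A x B
        float c  = 3.0f; // output element that is also read back as input

        for(int repeat = 0; repeat < 2; ++repeat)
        {
            c = alpha * ab + beta * c; // 5 after the first pass, 7 after the second
        }
        std::cout << c << std::endl; // prints 7; a single run gives the expected 5
        return 0;
    }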
From f631818937db143e42d444d4a0c2ce5646ad525e Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Tue, 5 Dec 2023 23:57:33 +0000 Subject: [PATCH 07/12] Fixed a bug in the CPU reference 1. ck::bhalf_t cannot be cast to float or double by static_cast. Use ck::type_convert() to fix it. 2. epsilon() is not a good value to measure the relative difference of data. It is too small for double (eps < 10e-13). --- .../contraction_cpu_reference_impl.hpp | 17 +++++++---------- .../configs/bilinear_test_params.yaml | 2 +- .../configs/scale_test_params.yaml | 2 +- test/utils.hpp | 9 ++++----- 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index ac4fc20d..a9a9d176 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -53,7 +53,6 @@ namespace hiptensor typename BDataType, typename DsDataType, typename EDataType, - typename AccDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, @@ -152,7 +151,7 @@ namespace hiptensor }; auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) { - auto accum = static_cast(0); + float accum = 0.0f; auto K0 = arg.mA_ms_ks_lengths[2]; auto K1 = arg.mA_ms_ks_lengths[3]; @@ -174,8 +173,7 @@ arg.mOpB(valB, ((BDataType*)arg.mB)[indexB]); // Mult / accum - accum - += static_cast(valA) * static_cast(valB); + accum += ck::type_convert(valA) * ck::type_convert(valB); } } @@ -184,15 +182,17 @@ if constexpr(std::is_same_v) { - arg.mOpCDE(((EDataType*)arg.mE)[indexE], accum); + arg.mOpCDE(((EDataType*)arg.mE)[indexE], + ck::type_convert(accum)); } else // bilinear { // NumDTensor will be 1 due to SFINAE of this class auto indexD = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); - arg.mOpCDE( - ((EDataType*)arg.mE)[indexE], accum, ((EDataType*)(arg.mD[0]))[indexD]); + arg.mOpCDE(((EDataType*)arg.mE)[indexE], + ck::type_convert(accum), + ((EDataType*)(arg.mD[0]))[indexD]); } }; @@ -323,7 +323,6 @@ namespace hiptensor typename BDataType, typename DsDataType, typename EDataType, - typename AccumDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, @@ -335,7 +334,6 @@ namespace hiptensor BDataType, DsDataType, EDataType, - AccumDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, @@ -375,7 +373,6 @@ namespace hiptensor BDataType, DsDataType, EDataType, - EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index 08ddf0b2..eee5d7f1 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -11,7 +11,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - - HIPTENSOR_ALGO_ACTOR_CRITIC + # - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index 08ddf0b2..eee5d7f1 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -11,7 +11,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - - HIPTENSOR_ALGO_ACTOR_CRITIC + # - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: diff --git a/test/utils.hpp b/test/utils.hpp index ad4bb565..05daf544 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -140,7 +140,7 @@ template std::pair compareEqual(DDataType const* deviceD, DDataType const* hostD, std::size_t elementsD, - double tolerance = 100.0) + double tolerance = 0.005) { bool retval = true; double max_relative_error = 0.0; @@ -202,7 +202,7 @@ std::pair compareEqual(DDataType const* deviceD, retval = false; max_relative_error = std::numeric_limits::signaling_NaN(); } - else if(max_relative_error > (eps * tolerance)) + else if(max_relative_error > tolerance) { retval = false; } @@ -214,7 +214,7 @@ template std::pair compareEqualLaunchKernel(DDataType* deviceD, DDataType* hostD, std::size_t elementsD, - double tolerance = 100.0) + double tolerance = 0.005) { auto blockDim = dim3(1024, 1, 1); auto gridDim = dim3(ceilDiv(elementsD, blockDim.x), 1, 1); @@ -276,13 +276,12 @@ std::pair compareEqualLaunchKernel(DDataType* deviceD, auto toDouble = [](DDataType const& val) { return static_cast(static_cast(val)); }; - auto eps = toDouble(std::numeric_limits::epsilon()); if(isNaN) { retval = false; maxRelativeError = std::numeric_limits::signaling_NaN(); } - else if(maxRelativeError > (eps * tolerance)) + else if(maxRelativeError > tolerance) { retval = false; }
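Some background on the first fix: ck::bhalf_t is an unsigned short holding raw bfloat16 bits, so static_cast<float> converts the integer bit pattern rather than the encoded number. A conversion has to reinterpret the bits. The sketch below shows the principle only; it is not CK's ck::type_convert implementation:

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // bfloat16 keeps the top 16 bits of an IEEE-754 binary32 value, so
    // shifting the bits into the high half of a 32-bit word recovers it.
    float bf16BitsToFloat(uint16_t bits)
    {
        uint32_t wide = static_cast<uint32_t>(bits) << 16;
        float    result;
        std::memcpy(&result, &wide, sizeof(result));
        return result;
    }

    int main()
    {
        uint16_t one = 0x3F80; // bfloat16 bit pattern of 1.0f
        std::cout << bf16BitsToFloat(one) << std::endl;    // 1 (decoded value)
        std::cout << static_cast<float>(one) << std::endl; // 16256 (the raw integer)
        return 0;
    }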
AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index 08ddf0b2..eee5d7f1 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -11,7 +11,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - - HIPTENSOR_ALGO_ACTOR_CRITIC + # - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index 08ddf0b2..eee5d7f1 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -11,7 +11,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - - HIPTENSOR_ALGO_ACTOR_CRITIC + # - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: diff --git a/test/utils.hpp b/test/utils.hpp index ad4bb565..05daf544 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -140,7 +140,7 @@ template <typename DDataType> std::pair<bool, double> compareEqual(DDataType const* deviceD, DDataType const* hostD, std::size_t elementsD, - double tolerance = 100.0) + double tolerance = 0.005) { bool retval = true; double max_relative_error = 0.0; @@ -202,7 +202,7 @@ std::pair<bool, double> compareEqual(DDataType const* deviceD, retval = false; max_relative_error = std::numeric_limits<double>::signaling_NaN(); } - else if(max_relative_error > (eps * tolerance)) + else if(max_relative_error > tolerance) { retval = false; } @@ -214,7 +214,7 @@ template <typename DDataType> std::pair<bool, double> compareEqualLaunchKernel(DDataType* deviceD, DDataType* hostD, std::size_t elementsD, - double tolerance = 100.0) + double tolerance = 0.005) { auto blockDim = dim3(1024, 1, 1); auto gridDim = dim3(ceilDiv(elementsD, blockDim.x), 1, 1); @@ -276,13 +276,12 @@ std::pair<bool, double> compareEqualLaunchKernel(DDataType* deviceD, auto toDouble = [](DDataType const& val) { return static_cast<double>(static_cast<float>(val)); }; - auto eps = toDouble(std::numeric_limits<DDataType>::epsilon()); if(isNaN) { retval = false; maxRelativeError = std::numeric_limits<double>::signaling_NaN(); } - else if(maxRelativeError > (eps * tolerance)) + else if(maxRelativeError > tolerance) { retval = false; } From e5cefe79a7e4630b4e1f07edd425a6cba6fda519 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 6 Dec 2023 01:43:48 +0000 Subject: [PATCH 08/12] Add comments --- library/src/contraction/contraction_meta_traits.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp index 6a7cb35f..e66ac432 100644 --- a/library/src/contraction/contraction_meta_traits.hpp +++ b/library/src/contraction/contraction_meta_traits.hpp @@ -67,6 +67,14 @@ namespace hiptensor constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; constexpr static ck::index_t DimsK = NumDimsK; + /* + * CK does not use hip_bfloat16; instead it uses ushort (ck::bhalf_t) as its bfloat16 type. + * What we want here is to use ck::bhalf_t with CK instances and hip_bfloat16 + * with hiptensor classes. + * + * When creating a solution, ck::bhalf_t is passed in to create the CK instance. + * When registering the solution, MetaTraits will return hip_bfloat16 to create the key.
+ */ using ADataT = std::conditional_t<std::is_same_v<ADataType, ck::bhalf_t>, hip_bfloat16, ADataType>; using BDataT From 4345a1c5b4b32fa427a8880a944895b3947ee6dd Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 6 Dec 2023 17:14:53 +0000 Subject: [PATCH 09/12] Rename contraction sample files The pattern of the contraction sample file names is - bilinear: simple_bilinear_contraction_<typeA>_<typeB>_<typeC>_<typeD>_compute_<computeType>.cpp - scale : simple_scale_contraction_<typeA>_<typeB>_<typeC>_compute_<computeType>.cpp --- samples/01_contraction/CMakeLists.txt | 85 ++++++++++--------- ...tion_bf16_bf16_bf16_bf16_compute_bf16.cpp} | 0 ...ntraction_f16_f16_f16_f16_compute_f16.cpp} | 0 ...traction_f32_f32_f32_f32_compute_bf16.cpp} | 0 ...ntraction_f32_f32_f32_f32_compute_f16.cpp} | 0 ...ntraction_f32_f32_f32_f32_compute_f32.cpp} | 0 ...ntraction_f64_f64_f64_f64_compute_f32.cpp} | 0 ...ntraction_f64_f64_f64_f64_compute_f64.cpp} | 0 ...ntraction_bf16_bf16_bf16_compute_bf16.cpp} | 0 ...e_contraction_f16_f16_f16_compute_f16.cpp} | 0 ..._contraction_f32_f32_f32_compute_bf16.cpp} | 0 ...e_contraction_f32_f32_f32_compute_f16.cpp} | 0 ...e_contraction_f32_f32_f32_compute_f32.cpp} | 0 ...e_contraction_f64_f64_f64_compute_f32.cpp} | 0 ...e_contraction_f64_f64_f64_compute_f64.cpp} | 0 15 files changed, 43 insertions(+), 42 deletions(-) rename samples/01_contraction/{simple_bilinear_contraction_bf16.cpp => simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f16.cpp => simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f32_bf16.cpp => simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f32_f16.cpp => simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f32.cpp => simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f64_f32.cpp => simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f64.cpp => simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_bf16.cpp => simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f16.cpp => simple_scale_contraction_f16_f16_f16_compute_f16.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f32_bf16.cpp => simple_scale_contraction_f32_f32_f32_compute_bf16.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f32_f16.cpp => simple_scale_contraction_f32_f32_f32_compute_f16.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f32.cpp => simple_scale_contraction_f32_f32_f32_compute_f32.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f64_f32.cpp => simple_scale_contraction_f64_f64_f64_compute_f32.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f64.cpp => simple_scale_contraction_f64_f64_f64_compute_f64.cpp} (100%) diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index de834d72..00393f1d 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -26,62 +26,63 @@ # Check whether building within hiptensor context if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) - add_hiptensor_sample(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) -
add_hiptensor_sample(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) - add_hiptensor_sample(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) - add_hiptensor_sample(simple_contraction_scale_f64 simple_scale_contraction_f64.cpp) - add_hiptensor_sample(simple_contraction_scale_f64_f32 simple_scale_contraction_f64_f32.cpp) - add_hiptensor_sample(simple_contraction_scale_f32_f16 simple_scale_contraction_f32_f16.cpp) - add_hiptensor_sample(simple_contraction_scale_f32_bf16 simple_scale_contraction_f32_bf16.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) - add_hiptensor_sample(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f64 simple_bilinear_contraction_f64.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f64_f32 simple_bilinear_contraction_f64_f32.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f32_f16 simple_bilinear_contraction_f32_f16.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f32_bf16 simple_bilinear_contraction_f32_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) + add_hiptensor_sample(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) + add_hiptensor_sample(simple_scale_contraction_f16_f16_f16_compute_f16 simple_scale_contraction_f16_f16_f16_compute_f16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) # If building hipTensor samples as a standalone Cmake project else() - add_executable(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) - target_link_libraries(simple_contraction_scale_f16 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp) + 
target_link_libraries(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) - target_link_libraries(simple_contraction_scale_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp) + target_link_libraries(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) - target_link_libraries(simple_contraction_scale_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f32_bf16 simple_scale_contraction_f32_bf16.cpp) - target_link_libraries(simple_contraction_scale_f32_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f32_f16 simple_scale_contraction_f32_f16.cpp) - target_link_libraries(simple_contraction_scale_f32_f16 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f64 simple_scale_contraction_f64.cpp) - target_link_libraries(simple_contraction_scale_f64 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) + target_link_libraries(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f64_f32 simple_scale_contraction_f64_f32.cpp) - target_link_libraries(simple_contraction_scale_f64_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) + target_link_libraries(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) - target_link_libraries(simple_contraction_bilinear_f16 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) + target_link_libraries(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) - target_link_libraries(simple_contraction_bilinear_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_f16_f16_f16_compute_f16 simple_scale_contraction_f16_f16_f16_compute_f16.cpp) + target_link_libraries(simple_scale_contraction_f16_f16_f16_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) - target_link_libraries(simple_contraction_bilinear_f32 PRIVATE 
hiptensor::hiptensor) + add_executable(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f32_bf16 simple_bilinear_contraction_f32_bf16.cpp) - target_link_libraries(simple_contraction_bilinear_f32_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f32_f16 simple_bilinear_contraction_f32_f16.cpp) - target_link_libraries(simple_contraction_bilinear_f32_f16 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_f32 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f64 simple_bilinear_contraction_f64.cpp) - target_link_libraries(simple_contraction_bilinear_f64 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) + target_link_libraries(simple_scale_contraction_f64_f64_f64_compute_f32 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) + target_link_libraries(simple_scale_contraction_f64_f64_f64_compute_f64 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f64_f32 simple_bilinear_contraction_f64_f32.cpp) - target_link_libraries(simple_contraction_bilinear_f64_f32 PRIVATE hiptensor::hiptensor) endif() diff --git a/samples/01_contraction/simple_bilinear_contraction_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_bf16.cpp rename to samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f16.cpp rename to samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp rename to samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp rename to samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f32.cpp rename to 
samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp rename to samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f64.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f64.cpp rename to samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp diff --git a/samples/01_contraction/simple_scale_contraction_bf16.cpp b/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_bf16.cpp rename to samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f16.cpp b/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f16.cpp rename to samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f32_bf16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f32_bf16.cpp rename to samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f32_f16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f32_f16.cpp rename to samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f32.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f32.cpp rename to samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f64_f32.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f64_f32.cpp rename to samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f64.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f64.cpp rename to samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp From 43f33ee5c6b40d0b4278cd1c221399eb99b16a7d Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 6 Dec 2023 21:02:53 +0000 Subject: [PATCH 10/12] Improve CPU reference accuracy The relative difference between the contraction result and the CPU reference is less than 0.1% after the improvement.
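The improvement is to round each partial product through the compute type before accumulating in float, mirroring the rounding the device kernel performs (see the hunk below). A minimal standalone sketch of that rounding model, using _Float16 as a stand-in compute type (illustrative only, not hiptensor code; _Float16 requires a compiler and target that support it, e.g. hipcc/clang):

```
#include <cstdio>

// Accumulate a dot product in float, but round each partial product
// through a narrower compute type first, as the device kernel would.
template <typename ComputeT>
float accumulateWithComputeRounding(const float* a, const float* b, int n)
{
    float accum = 0.0f;
    for(int i = 0; i < n; i++)
    {
        // Multiply in float, round to ComputeT, then widen back to float.
        accum += static_cast<float>(static_cast<ComputeT>(a[i] * b[i]));
    }
    return accum;
}

int main()
{
    float a[4] = {0.1f, 0.2f, 0.3f, 0.4f};
    float b[4] = {0.5f, 0.6f, 0.7f, 0.8f};
    // With an f16 compute type each product is quantized before the sum,
    // which is what pulls the CPU reference toward the device result.
    std::printf("f16-rounded: %f\n", accumulateWithComputeRounding<_Float16>(a, b, 4));
    std::printf("f32:         %f\n", accumulateWithComputeRounding<float>(a, b, 4));
    return 0;
}
```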
--- library/src/contraction/contraction_cpu_reference_impl.hpp | 3 ++- test/utils.hpp | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index a9a9d176..d21df2d3 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -173,7 +173,8 @@ namespace hiptensor arg.mOpB(valB, ((BDataType*)arg.mB)[indexB]); // Mult / accum - accum += ck::type_convert<float>(valA) * ck::type_convert<float>(valB); + accum += ck::type_convert<float>(ck::type_convert<ComputeDataType>( + ck::type_convert<float>(valA) * ck::type_convert<float>(valB))); } } diff --git a/test/utils.hpp b/test/utils.hpp index 05daf544..f39f0fb5 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -140,7 +140,7 @@ template <typename DDataType> std::pair<bool, double> compareEqual(DDataType const* deviceD, DDataType const* hostD, std::size_t elementsD, - double tolerance = 0.005) + double tolerance = 0.001) { bool retval = true; double max_relative_error = 0.0; @@ -214,7 +214,7 @@ template <typename DDataType> std::pair<bool, double> compareEqualLaunchKernel(DDataType* deviceD, DDataType* hostD, std::size_t elementsD, - double tolerance = 0.005) + double tolerance = 0.001) { auto blockDim = dim3(1024, 1, 1); auto gridDim = dim3(ceilDiv(elementsD, blockDim.x), 1, 1); From fec9065460d2205f9b9478ccd5f69fa51d2a839e Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 6 Dec 2023 21:19:36 +0000 Subject: [PATCH 11/12] Add comments to explain how to pass the alpha value --- library/src/contraction/contraction_selection.cpp | 11 ++++++++++- test/01_contraction/contraction_test.cpp | 15 ++++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index 68c748b0..9b0cdf9f 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -71,7 +71,16 @@ namespace hiptensor auto sizeE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides) * hipDataTypeSize(typeE); - void * A_d, *B_d, *D_d, *E_d, *wspace; + void *A_d, *B_d, *D_d, *E_d, *wspace; + + /* + * `alpha` and `beta` are void pointers. hiptensor uses readVal to load the value of alpha. + * ``` + * alphaF = hiptensor::readVal( + * alpha, convertToComputeType(HipDataType_v)); + * ``` + * Hence, `alpha` and `beta` need to point to a value of the compute type. + */ double alpha = 0.0; double beta = 0.0; writeVal(&alpha, computeType, 1.02); diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index ce67278f..76cc3033 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -491,9 +491,18 @@ namespace hiptensor auto CDataType = testType[2]; auto DDataType = testType[3]; - auto computeType = convertToComputeType(testType[4]); - double alphaBuf = 0.; - double betaBuf = 0.; + auto computeType = convertToComputeType(testType[4]); + + /* + * `alpha` and `beta` are void pointers. hiptensor uses readVal to load the value of alpha.
+ * ``` + * alphaF = hiptensor::readVal( + * alpha, convertToComputeType(HipDataType_v)); + * ``` + * Hence, `alpha` and `beta` need to point to a value of the compute type. + */ + double alphaBuf = 0.; + double betaBuf = 0.; writeVal(&alphaBuf, computeType, alpha); writeVal(&betaBuf, computeType, beta); From b21fe0b18881fb6ed5643be7bd2e242f9a4b45a2 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 7 Dec 2023 02:16:22 +0000 Subject: [PATCH 12/12] Update CPU reference 1. Revert the default threshold of the relative difference to (100 * std::numeric_limits<DDataType>::epsilon()). 2. Update the CPU reference so that the difference between the CPU reference and the output of the contraction instance is less than (100 * std::numeric_limits<DDataType>::epsilon()). --- .../contraction_cpu_reference_impl.hpp | 29 ++++++++++++++----- .../contraction_cpu_reference_instances.cpp | 14 +++++++++ .../configs/bilinear_test_params.yaml | 2 +- .../configs/scale_test_params.yaml | 2 +- test/utils.hpp | 9 +++--- 5 files changed, 42 insertions(+), 14 deletions(-) diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index d21df2d3..2e3d0cbe 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -45,19 +45,25 @@ namespace hiptensor { // hardcoded for NumDimM == NumDimN == NumDimK == 2 + // + // ck::bhalf_t is ushort, so we cannot perform bhalf_t * bhalf_t. + // CK does not use ck::bhalf_t as AccDataType, but we still + // add this guard here. template < ck::index_t NumDimM, ck::index_t NumDimN, ck::index_t NumDimK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename ComputeDataType = ADataType, - ck::enable_if_t, bool> = false> struct ReferenceContraction_M2_N2_K2 @@ -151,7 +157,7 @@ namespace hiptensor }; auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) { - float accum = 0.0f; + AccDataType accum = 0; auto K0 = arg.mA_ms_ks_lengths[2]; auto K1 = arg.mA_ms_ks_lengths[3]; @@ -165,16 +171,19 @@ namespace hiptensor auto indexB = offset(std::vector{n0, n1, k0, k1}, arg.mB_ns_ks_strides); - ADataType valA; - BDataType valB; + AccDataType valA; + AccDataType valB; // Element-wise ops - arg.mOpA(valA, ((ADataType*)arg.mA)[indexA]); - arg.mOpB(valB, ((BDataType*)arg.mB)[indexB]); + arg.mOpA( + valA, + ck::type_convert<AccDataType>(((ADataType*)arg.mA)[indexA])); + arg.mOpB( + valB, + ck::type_convert<AccDataType>(((BDataType*)arg.mB)[indexB])); // Mult / accum - accum += ck::type_convert<float>(ck::type_convert<ComputeDataType>( - ck::type_convert<float>(valA) * ck::type_convert<float>(valB))); + accum += valA * valB; } } @@ -322,6 +331,7 @@ namespace hiptensor ck::index_t NumDimsK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, typename AElementwiseOperation, @@ -333,6 +343,7 @@ namespace hiptensor NumDimsK, ADataType, BDataType, + AccDataType, DsDataType, EDataType, AElementwiseOperation, @@ -359,6 +370,7 @@ namespace hiptensor ck::index_t NumDimK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, typename AElementwiseOperation, @@ -372,6 +384,7 @@ namespace hiptensor NumDimK, ADataType, BDataType, + AccDataType, DsDataType, EDataType, AElementwiseOperation, diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp
index 173a49e9..31fb0191 100644 --- a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -39,6 +39,7 @@ namespace hiptensor 2, ck::half_t, ck::half_t, + float, ck::Tuple<ck::half_t>, ck::half_t, ck::tensor_operation::element_wise::PassThrough, @@ -53,6 +54,7 @@ namespace hiptensor 2, ck::bhalf_t, ck::bhalf_t, + float, ck::Tuple<ck::bhalf_t>, ck::bhalf_t, ck::tensor_operation::element_wise::PassThrough, @@ -67,6 +69,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<float>, float, ck::tensor_operation::element_wise::PassThrough, @@ -80,6 +83,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<float>, float, ck::tensor_operation::element_wise::PassThrough, @@ -93,6 +97,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<float>, float, ck::tensor_operation::element_wise::PassThrough, @@ -107,6 +112,7 @@ namespace hiptensor 2, double, double, + float, ck::Tuple<double>, double, ck::tensor_operation::element_wise::PassThrough, @@ -120,6 +126,7 @@ namespace hiptensor 2, double, double, + double, ck::Tuple<double>, double, ck::tensor_operation::element_wise::PassThrough, @@ -134,6 +141,7 @@ namespace hiptensor 2, ck::half_t, ck::half_t, + float, ck::Tuple<>, ck::half_t, ck::tensor_operation::element_wise::PassThrough, @@ -148,6 +156,7 @@ namespace hiptensor 2, ck::bhalf_t, ck::bhalf_t, + float, ck::Tuple<>, ck::bhalf_t, ck::tensor_operation::element_wise::PassThrough, @@ -162,6 +171,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<>, float, ck::tensor_operation::element_wise::PassThrough, @@ -175,6 +185,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<>, float, ck::tensor_operation::element_wise::PassThrough, @@ -188,6 +199,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<>, float, ck::tensor_operation::element_wise::PassThrough, @@ -202,6 +214,7 @@ namespace hiptensor 2, double, double, + float, ck::Tuple<>, double, ck::tensor_operation::element_wise::PassThrough, @@ -215,6 +228,7 @@ namespace hiptensor 2, double, double, + double, ck::Tuple<>, double, ck::tensor_operation::element_wise::PassThrough, diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index eee5d7f1..f4be1a88 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -29,7 +29,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 1 ] + - [ 24, 18, 2, 4, 9, 2 ] Strides: - [] ... diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index eee5d7f1..f4be1a88 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -29,7 +29,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 1 ] + - [ 24, 18, 2, 4, 9, 2 ] Strides: - [] ...
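The utils.hpp hunks below restore the epsilon-scaled threshold: a result is accepted when its maximum relative error stays within `tolerance` multiples of the element type's machine epsilon. A minimal sketch of that acceptance model (illustrative only; it assumes std::numeric_limits is specialized for DDataType, and the real compareEqual first reduces per-element relative errors to a maximum):

```
#include <cmath>
#include <limits>

// Accept a result when its maximum relative error is within `tolerance`
// multiples of DDataType's machine epsilon.
template <typename DDataType>
bool withinEpsTolerance(double maxRelativeError, double tolerance = 100.0)
{
    const double eps
        = static_cast<double>(std::numeric_limits<DDataType>::epsilon());
    return !std::isnan(maxRelativeError) && maxRelativeError <= eps * tolerance;
}

// e.g. withinEpsTolerance<float>(err) accepts err up to ~1.2e-5, while
// withinEpsTolerance<double>(err) accepts only up to ~2.2e-14.
```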
diff --git a/test/utils.hpp b/test/utils.hpp index f39f0fb5..ad4bb565 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -140,7 +140,7 @@ template <typename DDataType> std::pair<bool, double> compareEqual(DDataType const* deviceD, DDataType const* hostD, std::size_t elementsD, - double tolerance = 0.001) + double tolerance = 100.0) { bool retval = true; double max_relative_error = 0.0; @@ -202,7 +202,7 @@ std::pair<bool, double> compareEqual(DDataType const* deviceD, retval = false; max_relative_error = std::numeric_limits<double>::signaling_NaN(); } - else if(max_relative_error > tolerance) + else if(max_relative_error > (eps * tolerance)) { retval = false; } @@ -214,7 +214,7 @@ template <typename DDataType> std::pair<bool, double> compareEqualLaunchKernel(DDataType* deviceD, DDataType* hostD, std::size_t elementsD, - double tolerance = 0.001) + double tolerance = 100.0) { auto blockDim = dim3(1024, 1, 1); auto gridDim = dim3(ceilDiv(elementsD, blockDim.x), 1, 1); @@ -276,12 +276,13 @@ std::pair<bool, double> compareEqualLaunchKernel(DDataType* deviceD, auto toDouble = [](DDataType const& val) { return static_cast<double>(static_cast<float>(val)); }; + auto eps = toDouble(std::numeric_limits<DDataType>::epsilon()); if(isNaN) { retval = false; maxRelativeError = std::numeric_limits<double>::signaling_NaN(); } - else if(maxRelativeError > tolerance) + else if(maxRelativeError > (eps * tolerance)) { retval = false; }
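A closing note on the AccDataType guard added in this patch: ck::bhalf_t is a bare ushort, so multiplying two ck::bhalf_t values would do integer arithmetic on raw bit patterns, which is why the reference converts to a wider type before accumulating. A minimal sketch that emulates the bf16 layout with a plain uint16_t (not CK code) to show the difference:

```
#include <cstdint>
#include <cstdio>
#include <cstring>

// A bf16 value stores the top 16 bits of a float. Emulate that layout
// with uint16_t, the way ck::bhalf_t uses ushort.
using bhalf_emul = std::uint16_t;

float bhalfToFloat(bhalf_emul x)
{
    std::uint32_t bits = static_cast<std::uint32_t>(x) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

bhalf_emul floatToBhalf(float f) // truncating conversion, for illustration
{
    std::uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return static_cast<bhalf_emul>(bits >> 16);
}

int main()
{
    bhalf_emul a = floatToBhalf(1.5f);
    bhalf_emul b = floatToBhalf(2.0f);

    float wrong   = static_cast<float>(a) * static_cast<float>(b); // bit patterns
    float correct = bhalfToFloat(a) * bhalfToFloat(b);             // convert first

    std::printf("raw ushort product: %f\n", wrong);   // 267386880, nonsense
    std::printf("converted product:  %f\n", correct); // 3.0
    return 0;
}
```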